1. Scraping Novels with Go
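
The three files below make up a small crawler: main.go drives the loop, CrawlService extracts the encryption key and chapter ids embedded in the page, and Helper wraps HTTP requests and HTML stripping. The program fetches a starting chapter page, pulls the eKey out of it, then repeatedly POSTs to the site's ajaxchapter endpoint and appends each chapter's title and text to a local file.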

 package main

import (
    "bookget/CrawlService"
    "bookget/Helper"
    "encoding/json"
    "fmt"
    "os"
    "strconv"
    "time"
)

func main() {
    fmt.Printf("Start ==> ")

    uri := "https://www.biquge18.com/book/73308/1742.html"
    fileName := "test1.txt"

    content, err := Helper.HttpGet(uri)
    if err != nil {
        fmt.Fprintf(os.Stderr, "fetch chapter page: %v\n", err)
        os.Exit(1)
    }

    // The chapter page embeds an encryption key (eKey) that the
    // ajaxchapter endpoint requires on every request.
    ok, ekey := CrawlService.GetEkey(content)
    fmt.Print(ekey)
    if ok {
        createFile(fileName)
        bookDataHandle("1742", ekey, fileName)
    }
}
// bookDataHandle fetches the chapter with id cid, appends its title and text
// to the output file, then recurses into the next chapter.
func bookDataHandle(cid string, nkey string, file string) {
    fmt.Printf("CID ==>> %v \n", cid)
    jsonStr, err := Helper.HttpPost("https://www.biquge18.com/home/index/ajaxchapter",
        "id=73308&eKey="+nkey+"&basecid=1742&cid="+cid)
    if err != nil {
        os.Exit(2)
    }

    var s CrawlService.ContentData
    if err := json.Unmarshal([]byte(jsonStr), &s); err != nil || s.Info == nil {
        fmt.Fprintf(os.Stderr, "decode chapter %s: %v\n", cid, err)
        os.Exit(3)
    }

    fmt.Println(Helper.TrimHtml(s.Info.Title))
    appText := "\n" + Helper.TrimHtml(s.Info.Title) + "\n" + Helper.TrimHtml(s.Info.Content)
    appendToFile(file, appText)
    fmt.Print("\n")

    fmt.Printf("s.Info.Nextcid ==> %v \n", s.Info.Nextcid)
    newCid := strconv.Itoa(s.Info.Nextcid)
    fmt.Printf("new_cid ==> %s \n", newCid)

    // Throttle requests so the site is not hammered.
    fmt.Print("sleep 3s.\n")
    time.Sleep(3 * time.Second)

    if s.Info.Nextcid > 0 {
        bookDataHandle(newCid, nkey, file)
    } else {
        // The response carries no next-chapter id: guess the current id
        // plus one and keep going.
        n, _ := strconv.Atoi(cid)
        bookDataHandle(strconv.Itoa(n+1), nkey, file)
    }
}
// createFile creates (or truncates) the output file so a fresh run starts
// from an empty file.
func createFile(fileName string) {
    fout, err := os.Create(fileName)
    if err != nil {
        fmt.Println(fileName, err)
        return
    }
    defer fout.Close()
}
// appendToFile appends str to file, creating the file if necessary.
func appendToFile(file, str string) {
    f, err := os.OpenFile(file, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0660)
    if err != nil {
        fmt.Printf("Cannot open file %s!\n", file)
        return
    }
    defer f.Close()
    if _, err := f.WriteString(str); err != nil {
        fmt.Printf("Cannot write to file %s: %v\n", file, err)
    }
}
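
main.go decodes the ajaxchapter response into CrawlService.ContentData (defined in the next file). The exact payload shape is an assumption inferred from that struct and its json tags; here is a minimal, self-contained sketch of the decoding step:

 package main

import (
    "encoding/json"
    "fmt"
)

// Local copies of the CrawlService types, so this sketch runs on its own.
type infoData struct {
    Content string
    Title   string
    Nextcid int `json:"nextcid"`
}

type ContentData struct {
    Status string
    Info   *infoData
}

func main() {
    // Hypothetical response body; real field values come from the site.
    raw := `{"status":"1","info":{"title":"Chapter 1","content":"<p>text</p>","nextcid":1743}}`
    var s ContentData
    if err := json.Unmarshal([]byte(raw), &s); err != nil {
        panic(err)
    }
    fmt.Println(s.Info.Title, s.Info.Nextcid) // Chapter 1 1743
}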

1.1.1. CrawlService/crawl.go

 package CrawlService

import (
    "regexp"
)

// ContentData mirrors the JSON envelope returned by the ajaxchapter endpoint.
type ContentData struct {
    Status string
    Info   *infoData
}

// infoData holds one chapter: its title, HTML body, and the id of the
// following chapter (0 when the response carries none).
type infoData struct {
    Content string
    Title   string
    Nextcid int `json:"nextcid"`
}

// GetEkey pulls the encryption key out of the chapter page, where it is
// embedded as `var hash = "..."`. The previous greedy POSIX pattern could
// over-match past the closing quote; a single non-greedy Perl-style regexp
// also removes the unchecked [0][1] indexing, which panicked on no match.
func GetEkey(html string) (res bool, ekey string) {
    re := regexp.MustCompile(`var hash = "(.*?)"`)
    m := re.FindStringSubmatch(html)
    if m == nil || m[1] == "" {
        return false, ""
    }
    return true, m[1]
}

// GetNextId pulls the next chapter id out of the page, embedded as
// `var nextcid = "..."`. Note that regexp.CompilePOSIX rejects the
// non-greedy `(.*?)`, so the original call returned a nil regexp whose
// use would panic; the default Perl-syntax engine handles it fine.
func GetNextId(html string) (res bool, nextid string) {
    re := regexp.MustCompile(`var nextcid = "(.*?)"`)
    m := re.FindStringSubmatch(html)
    if m == nil || m[1] == "" {
        return false, ""
    }
    return true, m[1]
}
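
A quick way to exercise GetEkey outside the crawler; the HTML fragment here is fabricated, since only the `var hash = "..."` pattern matters:

 package main

import (
    "fmt"

    "bookget/CrawlService"
)

func main() {
    html := `<script>var hash = "abc123";</script>`
    ok, ekey := CrawlService.GetEkey(html)
    fmt.Println(ok, ekey) // true abc123
}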

1.1.2. Helper/until.go

 package Helper

import (
    "fmt"
    "io"
    "net/http"
    "os"
    "reflect"
    "regexp"
    "strings"
    "unsafe"
)
// HttpPost sends a form-encoded POST request and returns the response body.
func HttpPost(uri string, params string) (contents string, err error) {
    resp, err := http.Post(uri,
        "application/x-www-form-urlencoded",
        strings.NewReader(params))
    if err != nil {
        return "", err
    }
    defer resp.Body.Close()

    // io.ReadAll replaces the deprecated ioutil.ReadAll (Go 1.16+).
    body, err := io.ReadAll(resp.Body)
    if err != nil {
        return "", err
    }
    return string(body), nil
}
// HttpGet fetches url and returns the response body as a string.
func HttpGet(url string) (contents string, err error) {
    resp, err := http.Get(url)
    if err != nil {
        fmt.Fprintf(os.Stderr, "fetch: %v\n", err)
        return "", err
    }
    defer resp.Body.Close()

    b, err := io.ReadAll(resp.Body)
    if err != nil {
        fmt.Fprintf(os.Stderr, "fetch: reading %s: %v\n", url, err)
        return "", err
    }
    return ByteString(b), nil
}

// ByteString converts a []byte to a string without copying. The caller must
// not modify b afterwards, because the string shares its backing array.
func ByteString(b []byte) string {
    return *(*string)(unsafe.Pointer(&b))
}

// StringByte converts a string to a []byte without copying. The result must
// be treated as read-only: strings are immutable in Go, so writing to the
// returned slice is undefined behavior.
func StringByte(s string) []byte {
    sh := (*reflect.StringHeader)(unsafe.Pointer(&s))
    bh := reflect.SliceHeader{
        Data: sh.Data,
        Len:  sh.Len,
        Cap:  sh.Len,
    }
    return *(*[]byte)(unsafe.Pointer(&bh))
}
// TrimHtml strips HTML markup from src, leaving plain text with single
// newlines between blocks.
func TrimHtml(src string) string {
    // Lowercase all tags so the patterns below only need to match one case.
    re := regexp.MustCompile(`<[\S\s]+?>`)
    src = re.ReplaceAllStringFunc(src, strings.ToLower)
    // Drop <style> blocks.
    re = regexp.MustCompile(`<style[\S\s]+?</style>`)
    src = re.ReplaceAllString(src, "")
    // Drop <script> blocks.
    re = regexp.MustCompile(`<script[\S\s]+?</script>`)
    src = re.ReplaceAllString(src, "")
    // Replace every remaining tag with a newline.
    re = regexp.MustCompile(`<[\S\s]+?>`)
    src = re.ReplaceAllString(src, "\n")
    // Collapse runs of whitespace into a single newline.
    re = regexp.MustCompile(`\s{2,}`)
    src = re.ReplaceAllString(src, "\n")
    return strings.TrimSpace(src)
}
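
For instance, TrimHtml reduces a small fragment to bare text (output shown in the comment):

 package main

import (
    "fmt"

    "bookget/Helper"
)

func main() {
    html := `<div><b>Chapter 1</b><script>var x = 1;</script><p>First line.</p></div>`
    fmt.Println(Helper.TrimHtml(html))
    // Output:
    // Chapter 1
    // First line.
}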
