1. 利用GO 爬取小说
package main
import (
"bookget/CrawlService"
"bookget/Helper"
"encoding/json"
"fmt"
"os"
"strconv"
"time"
)
// main fetches a hard-coded chapter page from biquge18.com, extracts
// the per-session "hash" key that the AJAX chapter endpoint requires,
// then crawls chapters sequentially into test1.txt.
func main() {
	fmt.Printf("Start ==> ")

	const (
		startURL = "https://www.biquge18.com/book/73308/1742.html"
		fileName = "test1.txt"
	)

	content, err := Helper.HttpGet(startURL)
	if err != nil {
		// HttpGet already logged the error to stderr.
		os.Exit(1)
	}

	ok, ekey := CrawlService.GetEkey(content)
	fmt.Print(ekey)
	if ok {
		createFile(fileName)
		bookDataHanle("1742", ekey, fileName)
	}
}
// bookDataHanle downloads chapters starting at cid and appends each
// one (title + body, HTML stripped) to file. nkey is the per-session
// "hash" value extracted by CrawlService.GetEkey.
//
// Note: as in the original, this never terminates normally — when the
// server reports no next chapter (Nextcid <= 0) it retries with cid+1.
// The original recursed on every chapter, growing the stack without
// bound over a long crawl; this version iterates instead.
func bookDataHanle(cid string, nkey string, file string) {
	for {
		fmt.Printf("CID ==>> %v \n", cid)
		jsonStr, err := Helper.HttpPost(
			"https://www.biquge18.com/home/index/ajaxchapter",
			"id=73308&eKey="+nkey+"&basecid=1742&cid="+cid)
		if err != nil {
			os.Exit(2)
		}

		var s CrawlService.ContentData
		// The original ignored the unmarshal error; a malformed response
		// left s.Info nil and panicked on the dereference below.
		if err := json.Unmarshal([]byte(jsonStr), &s); err != nil || s.Info == nil {
			fmt.Printf("bad chapter response for cid %s: %v\n", cid, err)
			os.Exit(3)
		}

		fmt.Println(Helper.TrimHtml(s.Info.Title))
		appText := "\n" + Helper.TrimHtml(s.Info.Title) + "\n" + Helper.TrimHtml(s.Info.Content)
		appendToFile(file, appText)

		fmt.Print("\n")
		fmt.Printf("s.Info.Nextcid ==> %v \n", s.Info.Nextcid)

		// Choose the next chapter id: trust the server's Nextcid when
		// positive, otherwise guess sequentially (cid+1), as before.
		if s.Info.Nextcid > 0 {
			cid = strconv.Itoa(s.Info.Nextcid)
			fmt.Printf("new_cid ==> %s \n", cid)
		} else {
			n, _ := strconv.Atoi(cid)
			cid = strconv.Itoa(n + 1)
		}

		// Throttle between requests to be polite to the site.
		fmt.Print("sleep 3s.\n")
		time.Sleep(3 * time.Second)
	}
}
func createFile(fileName string) {
userFile := fileName
fout, err := os.Create(userFile)
if err != nil {
fmt.Println(userFile, err)
return
}
defer fout.Close()
}
func appendToFile(file, str string) {
f, err := os.OpenFile(file, os.O_CREATE|os.O_APPEND|os.O_RDWR, 0660)
if err != nil {
fmt.Printf("Cannot open file %s!\n", file)
return
}
defer f.Close()
f.WriteString(str)
}
1.1.1. CrawlService/crawl.go
package CrawlService
import (
"regexp"
)
// ContentData is the top-level shape of the JSON returned by the
// /home/index/ajaxchapter endpoint (decoded in bookDataHanle).
// Field names match the payload keys case-insensitively via
// encoding/json's default matching; only Nextcid needs an explicit tag.
type ContentData struct {
Status string
// Info is nil when the response body is not the expected JSON —
// callers must check before dereferencing.
Info *infoData
}
// infoData carries one chapter of the book.
type infoData struct {
Content string // chapter body, raw HTML (stripped by Helper.TrimHtml)
Title string // chapter title, may also contain HTML
// Nextcid is the id of the following chapter; presumably 0 or
// negative when there is none — TODO confirm against the API.
Nextcid int `json:"nextcid"`
}
// hashRe captures the session key from `var hash = "..."` in the
// chapter page. Compiled once at package scope; the non-greedy group
// stops at the first closing quote (the original's greedy POSIX
// pattern could swallow everything up to the LAST quote in the page).
var hashRe = regexp.MustCompile(`var hash = "(.*?)"`)

// GetEkey extracts the "hash" session key from a chapter page.
// It returns (true, key) on success, (false, "no match") when the
// pattern is absent, and (false, "") when the captured key is empty —
// preserving the original function's contract.
func GetEkey(html string) (res bool, ekey string) {
	m := hashRe.FindStringSubmatch(html)
	if m == nil {
		return false, "no match"
	}
	if m[1] == "" {
		return false, ""
	}
	return true, m[1]
}
// nextcidRe captures the next chapter id from `var nextcid = "..."`.
// Compiled once at package scope. The original called
// regexp.CompilePOSIX with a non-greedy `(.*?)` group, which POSIX
// syntax rejects; the ignored error left a nil *Regexp and the next
// call panicked. Perl syntax (MustCompile) supports non-greedy groups.
var nextcidRe = regexp.MustCompile(`var nextcid = "(.*?)"`)

// GetNextId extracts the next chapter id from a chapter page.
// It returns (true, id) on success, (false, "no match") when the
// pattern is absent, and (false, "") when the captured id is empty —
// preserving the original function's contract.
func GetNextId(html string) (res bool, nextid string) {
	m := nextcidRe.FindStringSubmatch(html)
	if m == nil {
		return false, "no match"
	}
	if m[1] == "" {
		return false, ""
	}
	return true, m[1]
}
1.1.2. Helper/until.go
package Helper
import (
"fmt"
"io/ioutil"
"net/http"
"os"
"reflect"
"regexp"
"strings"
"unsafe"
)
// HttpPost sends params as a form-urlencoded POST body to uri and
// returns the response body as a string.
func HttpPost(uri string, params string) (contents string, err error) {
	payload := strings.NewReader(params)
	resp, err := http.Post(uri, "application/x-www-form-urlencoded", payload)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	raw, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}
	return string(raw), nil
}
// HttpGet fetches url and returns the response body as a string.
// Errors are additionally logged to stderr before being returned.
func HttpGet(url string) (contents string, err error) {
	resp, err := http.Get(url)
	if err != nil {
		fmt.Fprintf(os.Stderr, "fetch: %v\n", err)
		return "", err
	}
	defer resp.Body.Close()

	data, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		fmt.Fprintf(os.Stderr, "fetch: reading %s: %v\n", url, err)
		return "", err
	}
	return ByteString(data), nil
}
// ByteString reinterprets b as a string without copying.
//
// WARNING: the returned string aliases b's backing array — if the
// caller mutates b afterwards, the "immutable" string changes with it.
// Safe in this package only because HttpGet never reuses the buffer.
// NOTE(review): prefer a plain string(b) copy unless profiling shows
// the copy matters.
func ByteString(b []byte) string {
return *(*string)(unsafe.Pointer(&b))
}
// StringByte reinterprets s as a []byte without copying.
//
// WARNING: the result aliases the string's data, which the runtime
// treats as immutable — writing through the returned slice is
// undefined behavior. reflect.StringHeader/SliceHeader are deprecated
// as of Go 1.20; NOTE(review): prefer []byte(s), or
// unsafe.Slice(unsafe.StringData(s), len(s)) if zero-copy is required.
func StringByte(s string) []byte {
sh := (*reflect.StringHeader)(unsafe.Pointer(&s))
bh := reflect.SliceHeader{
Data: sh.Data,
Len: sh.Len,
Cap: sh.Len,
}
return *(*[]byte)(unsafe.Pointer(&bh))
}
// Compiled once at package scope: TrimHtml runs three times per
// chapter, and the original recompiled all of these on every call.
var (
	trimTagRe    = regexp.MustCompile(`<[\S\s]+?>`)          // any HTML tag
	trimStyleRe  = regexp.MustCompile(`<style[\S\s]+?</style>`)
	trimScriptRe = regexp.MustCompile(`<script[\S\s]+?</script>`)
	trimSpaceRe  = regexp.MustCompile(`\s{2,}`)              // runs of whitespace
)

// TrimHtml strips HTML from src: tags are lowercased so the
// case-sensitive style/script patterns match, <style>/<script>
// elements are removed with their contents, remaining tags become
// newlines, whitespace runs collapse to a single newline, and the
// result is trimmed.
func TrimHtml(src string) string {
	src = trimTagRe.ReplaceAllStringFunc(src, strings.ToLower)
	src = trimStyleRe.ReplaceAllString(src, "")
	src = trimScriptRe.ReplaceAllString(src, "")
	src = trimTagRe.ReplaceAllString(src, "\n")
	src = trimSpaceRe.ReplaceAllString(src, "\n")
	return strings.TrimSpace(src)
}