菜单 学习猿地 - LMONKEY

VIP

开通学习猿地VIP

尊享10项VIP特权 持续新增

知识通关挑战

打卡带练!告别无效练习

接私单赚外快

VIP优先接,累计金额超百万

学习猿地私房课免费学

大厂实战课仅对VIP开放

你的一对一导师

每月可免费咨询大牛30次

领取更多软件工程师实用特权

入驻
191
0

go-爬段子

原创
05/13 14:22
阅读数 53557

爬取搞笑的段子,横向爬取+纵向爬取

横向爬取:按页码逐页爬取列表页;纵向爬取:爬取每一页中各条段子的具体内容。

package main

import (
	"fmt"
	"io"
	"net/http"
	"os"
	"regexp"
	"strconv"
	"strings"
)

// HttpGet fetches url with an HTTP GET request and returns the complete
// response body as a string.
//
// A non-nil error is returned when the request cannot be performed, when the
// server answers with a non-2xx status code, or when reading the body fails.
// On error the returned result is always empty.
func HttpGet(url string) (result string, err error) {
	resp, err := http.Get(url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	// An error page is not scrape-able content; report it instead of handing
	// its HTML to the caller as if it were the requested page.
	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		return "", fmt.Errorf("GET %s: unexpected status %s", url, resp.Status)
	}

	// io.Copy surfaces every read error (including one that arrives together
	// with zero bytes) and the builder avoids re-copying the accumulated body
	// on every chunk.
	var body strings.Builder
	if _, err := io.Copy(&body, resp.Body); err != nil {
		return "", fmt.Errorf("reading body of %s: %w", url, err)
	}
	return body.String(), nil
}

// SaveJoke2File writes the jokes collected from listing page idx to the file
// "第<idx>页.txt" in the current working directory, replacing any existing file.
//
// fileTitle[i] and fileContent[i] describe the same joke. Each joke is written
// as its title line, its content line and a separator line. If fileContent is
// shorter than fileTitle, the missing contents are written as empty lines
// rather than panicking. Failures to create, write or close the file are
// printed to standard output; nothing is returned.
func SaveJoke2File(idx int, fileTitle, fileContent []string) {
	path := "第" + strconv.Itoa(idx) + "页.txt"
	f, err := os.Create(path)
	if err != nil {
		fmt.Println("err:", err)
		return
	}

	// Assemble the whole page in memory so the file is written with a single
	// call instead of two small writes per joke.
	var out strings.Builder
	for i, title := range fileTitle {
		var content string
		if i < len(fileContent) {
			content = fileContent[i]
		}
		out.WriteString(title + "\n" + content + "\n")
		out.WriteString("-----------------------------\n")
	}

	_, err = f.WriteString(out.String())
	// Close explicitly (not via defer) so that a failed flush is reported too.
	if cerr := f.Close(); err == nil {
		err = cerr
	}
	if err != nil {
		fmt.Println("err:", err)
	}
}

// jokeLinkRe captures, in group 1, the href of each joke link found on a
// listing page. It is compiled once instead of on every Spiderpage call.
var jokeLinkRe = regexp.MustCompile(`<h1 class="f18"><a href="(?s:(.*?))"`)

// Spiderpage crawls one listing page.
//
// It downloads https://m.pengfue.com/xiaohua_<idx>.html, follows every joke
// link found on it, and stores the collected titles and contents through
// SaveJoke2File. Jokes whose page cannot be fetched are reported and skipped.
//
// Exactly one value (idx) is sent on page when the function finishes, whether
// or not the listing page could be downloaded, so a caller that waits for one
// value per started page never blocks forever on a failed page.
func Spiderpage(idx int, page chan int) {
	defer func() { page <- idx }()

	url := "https://m.pengfue.com/xiaohua_" + strconv.Itoa(idx) + ".html"

	result, err := HttpGet(url)
	if err != nil {
		fmt.Println("httpget err", err)
		return
	}

	alls := jokeLinkRe.FindAllStringSubmatch(result, -1)

	fileTitle := make([]string, 0, len(alls))
	fileContent := make([]string, 0, len(alls))

	for _, jokeURL := range alls {
		title, content, err := SpiderJokePage(jokeURL[1])
		if err != nil {
			fmt.Println("err:", err)
			continue
		}
		fileTitle = append(fileTitle, title)
		fileContent = append(fileContent, content)
	}

	SaveJoke2File(idx, fileTitle, fileContent)
}

// toWork crawls the listing pages start through end (inclusive) concurrently.
// One goroutine running Spiderpage is started per page; the function then
// receives one completion value per started page and prints it.
func toWork(start, end int) {
	fmt.Printf("正在爬取%d到%d页。。。\n", start, end)

	done := make(chan int)

	// Fan out: one crawler goroutine per page number.
	for pageNo := start; pageNo <= end; pageNo++ {
		go Spiderpage(pageNo, done)
	}

	// Fan in: wait for as many completion signals as goroutines were started.
	for pending := start; pending <= end; pending++ {
		finished := <-done
		fmt.Printf("第%d个页面爬取完成\n", finished)
	}
}

// Patterns and cleaners used to parse a single joke page. They are built once
// at package scope because SpiderJokePage runs for every joke on every page.
var (
	// jokeTitleRe captures the text of the <title> element.
	jokeTitleRe = regexp.MustCompile(`<title>(?s:(.*?))</title>`)
	// jokeContentRe captures the inner HTML of the con-txt div.
	jokeContentRe = regexp.MustCompile(`<div class="con-txt">(?s:(.*?))</div>`)
	// jokeTitleCleaner removes spaces and newlines from a title.
	jokeTitleCleaner = strings.NewReplacer(" ", "", "\n", "")
	// jokeContentCleaner removes spaces, newlines and tabs from a content.
	jokeContentCleaner = strings.NewReplacer(" ", "", "\n", "", "\t", "")
)

// SpiderJokePage downloads the joke page at url and extracts its title and
// content.
//
// The title is the text of the first <title> element with spaces and newlines
// removed. The content is the body of the first <div class="con-txt"> with
// spaces, newlines, tabs and "&nbsp;" entities removed. A part that is not
// found on the page is returned as an empty string. err is non-nil only when
// the page could not be fetched, in which case title and content are empty.
func SpiderJokePage(url string) (title, content string, err error) {
	result, err := HttpGet(url)
	if err != nil {
		return "", "", err
	}
	title, content = parseJokePage(result)
	return title, content, nil
}

// parseJokePage extracts the cleaned title and content from a joke page's HTML.
func parseJokePage(html string) (title, content string) {
	// Only the first occurrence of each pattern is used.
	if m := jokeTitleRe.FindStringSubmatch(html); m != nil {
		title = jokeTitleCleaner.Replace(m[1])
	}
	if m := jokeContentRe.FindStringSubmatch(html); m != nil {
		// Strip whitespace first and the entity afterwards, so an entity that
		// was split by whitespace in the HTML is removed as well.
		content = jokeContentCleaner.Replace(m[1])
		content = strings.Replace(content, "&nbsp;", "", -1)
	}
	return title, content
}

// main asks for the first and last listing page on standard input and crawls
// that inclusive range with toWork. Input that is not an integer, or a range
// whose start is after its end, is reported and nothing is crawled.
func main() {
	var start, end int

	fmt.Print("请输入起始页。。。")
	if _, err := fmt.Scan(&start); err != nil {
		fmt.Println("err: invalid start page:", err)
		return
	}

	fmt.Print("请输入终止页。。。")
	if _, err := fmt.Scan(&end); err != nil {
		fmt.Println("err: invalid end page:", err)
		return
	}

	if start > end {
		fmt.Printf("err: start page %d is after end page %d\n", start, end)
		return
	}

	toWork(start, end)
}

发表评论

0/200
191 点赞
0 评论
收藏