First Look at Web Scraping


1: Basic usage of the requests module

    # 1. basic use of requests
    import requests

    # the target URL
    url = 'https://www.sogou.com/'

    # get() returns a response object
    response = requests.get(url=url)

    # the response body as text
    page_text = response.text

    # persist it to disk
    with open('./sougou.html', 'w', encoding='utf-8') as fp:
        fp.write(page_text)

    print('Born fearless, fight to the final chapter')
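
A real download can fail or arrive in an unexpected encoding. Here is a minimal sketch of the same fetch with a status check and encoding detection bolted on (the timeout value is just an illustrative choice):

    import requests

    url = 'https://www.sogou.com/'
    response = requests.get(url=url, timeout=10)

    # raise an HTTPError on 4xx/5xx instead of silently saving an error page
    response.raise_for_status()

    # let requests guess the page encoding before reading .text
    response.encoding = response.apparent_encoding

    with open('./sougou.html', 'w', encoding='utf-8') as fp:
        fp.write(response.text)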

2: Building a simple page-saving tool

    # goal: a page collector whose output file name is built dynamically

    import requests

    key = input('Enter a search term: ')

    # 1. make the query parameter dynamic
    params = {
        'query': key
    }
    # spoof the UA
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
    }

    # URL: keep only the part before the question mark
    url = 'https://www.sogou.com/web'

    response = requests.get(url=url, params=params, headers=headers)

    page_text = response.text

    # build the file name dynamically for persistence
    fileName = key + '.html'

    with open(fileName, 'w', encoding='utf-8') as fp:
        fp.write(page_text)
    print('Born fearless, fight to the final chapter')
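
One thing worth knowing here: requests percent-encodes `params` into the query string for you, and `response.url` shows the final URL that was actually requested, which helps when debugging. A tiny sketch:

    import requests

    params = {'query': '爬虫'}
    response = requests.get('https://www.sogou.com/web', params=params)

    # the query string is percent-encoded automatically
    print(response.url)  # e.g. https://www.sogou.com/web?query=%E7%88%AC%E8%99%AB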

3: Dynamically loaded data

    # scrape data from Douban

    import requests

    # spoof the UA
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
    }

    url = 'https://movie.douban.com/j/chart/top_list'

    params = {
        'type': '11',
        'interval_id': '100:90',
        'action': '',
        'start': '0',   # offset of the first entry
        'limit': '1',   # number of entries per request
    }

    json_data = requests.get(url=url, headers=headers, params=params).json()
    print(json_data)
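
Since the endpoint pages through results with `start` and `limit`, the whole chart can be walked in chunks. A hedged sketch that stops when a page comes back empty (the `title` field is what top_list entries appear to carry; treat it as an assumption):

    import requests

    headers = {'User-Agent': 'Mozilla/5.0'}
    url = 'https://movie.douban.com/j/chart/top_list'

    start, page_size = 0, 20
    while True:
        params = {
            'type': '11',
            'interval_id': '100:90',
            'action': '',
            'start': str(start),
            'limit': str(page_size),
        }
        movies = requests.get(url=url, headers=headers, params=params).json()
        if not movies:  # an empty list means we ran past the end
            break
        for movie in movies:
            print(movie['title'])  # assumes each entry carries a 'title' field
        start += page_size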

4: POST requests

    # scrape KFC store locations
    import requests

    # spoof the UA
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
    }

    url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'

    # walk through the result pages
    a = 1  # running counter of printed stores
    for pageNum in range(1, 20):
        data = {
            'cname': '',
            'pid': '',
            'keyword': '上海',
            'pageIndex': pageNum,
            'pageSize': '10',
        }
        json_data = requests.post(url=url, headers=headers, data=data).json()['Table1']
        for dic in json_data:
            print(dic['addressDetail'])

            print(a)
            a += 1
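
A hard-coded `range(1, 20)` keeps posting even after the results run out. A hedged variant stops as soon as a page returns no rows (still assuming, as above, that each page's rows live under 'Table1'):

    import requests

    headers = {'User-Agent': 'Mozilla/5.0'}
    url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'

    pageNum = 1
    while True:
        data = {
            'cname': '',
            'pid': '',
            'keyword': '上海',
            'pageIndex': pageNum,
            'pageSize': '10',
        }
        rows = requests.post(url=url, headers=headers, data=data).json().get('Table1', [])
        if not rows:  # no rows left: stop paging
            break
        for dic in rows:
            print(dic['addressDetail'])
        pageNum += 1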

5: Batch-fetching data from the NMPA (the drug regulator)
    import requests

    # spoof the UA
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
    }

    '''
    1. Fetch the detail data for one company first:
        - Is the detail data loaded dynamically?
            - A local search in the packet-capture tool shows it is.
        - Use a global search in the capture tool to locate the request
          that carries the dynamically loaded data.
    2. How do we get each company's ID?
    '''
    ids = []
    # batch-fetch company IDs
    main_url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsList'
    for pageNum in range(1, 6):
        data = {
            'on': 'true',
            'page': str(pageNum),
            'pageSize': '15',
            'productName': '',
            'conditionType': '1',
            'applyname': '',
            'applysn': '',
        }

        data_list = requests.post(main_url, headers=headers, data=data).json()['list']
        for dic in data_list:
            _id = dic['ID']
            ids.append(_id)
    # fetch each company's detail data by ID
    url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById'

    for _id in ids:
        data = {
            'id': _id
        }
        json_data = requests.post(url=url, headers=headers, data=data).json()
        print(json_data['epsName'])
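
These two loops fire dozens of POSTs at the same host, so a `requests.Session` is a natural fit: it reuses the underlying connection and carries shared headers. A minimal sketch of the detail loop rewritten that way (the ID list is a placeholder for the one collected above):

    import requests

    session = requests.Session()
    session.headers.update({'User-Agent': 'Mozilla/5.0'})

    url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById'

    ids = ['...']  # placeholder: the ID list collected above
    for _id in ids:
        json_data = session.post(url=url, data={'id': _id}).json()
        print(json_data['epsName'])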

6: Scraping Honor's offline stores
    import requests

    # spoof the UA
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
    }
    # batch-fetch shop info
    main_url = 'https://openapi.vmall.com/mcp/offlineshop/getShopList'

    data = {"portal": 2, "lang": "zh-CN", "country": "CN", "brand": 1,
            "province": "北京", "city": "北京", "pageNo": 1, "pageSize": 20}

    json_data = requests.post(main_url, headers=headers, json=data).json()['shopInfos']

    for shop in json_data:
        print(shop)  # each entry describes one offline shop

    # detail endpoint; fetching per-shop details is left unfinished in these notes
    url = 'https://openapi.vmall.com/mcp/offlineshop/getShopById'
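
The listing endpoint pages with `pageNo`/`pageSize`, so all shops for a city can be collected page by page. A hedged sketch that assumes an empty `shopInfos` list marks the end:

    import requests

    headers = {'User-Agent': 'Mozilla/5.0'}
    main_url = 'https://openapi.vmall.com/mcp/offlineshop/getShopList'

    shops, pageNo = [], 1
    while True:
        data = {"portal": 2, "lang": "zh-CN", "country": "CN", "brand": 1,
                "province": "北京", "city": "北京", "pageNo": pageNo, "pageSize": 20}
        page = requests.post(main_url, headers=headers, json=data).json().get('shopInfos', [])
        if not page:  # empty page: assume there are no more shops
            break
        shops.extend(page)
        pageNo += 1

    print(len(shops), 'shops collected')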

7: Simple image scraping
    import requests

    # spoof the UA
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
    }

    # variant 1: requests (binary data comes back through .content)
    # img_src = 'https://i03piccdn.sogoucdn.com/e931046dcf5606d8'
    #
    # img_data = requests.get(img_src, headers=headers).content
    #
    # with open('./123.jpg', 'wb') as fp:
    #     fp.write(img_data)

    # variant 2: urllib downloads straight to a file (no custom UA this way)
    from urllib import request
    img_src = 'https://i03piccdn.sogoucdn.com/e931046dcf5606d8'
    request.urlretrieve(img_src, './456.jpg')
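
For large files, requests can also stream the body in chunks instead of holding it all in memory. A minimal sketch using the same image URL (the chunk size and output name are arbitrary):

    import requests

    img_src = 'https://i03piccdn.sogoucdn.com/e931046dcf5606d8'
    headers = {'User-Agent': 'Mozilla/5.0'}

    # stream=True defers the body download; iter_content yields it in chunks
    with requests.get(img_src, headers=headers, stream=True) as response:
        with open('./789.jpg', 'wb') as fp:
            for chunk in response.iter_content(chunk_size=8192):
                fp.write(chunk)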

8: Basic usage of bs4
# note: no novel here yet; this section only parses a local test page

    from bs4 import BeautifulSoup
    fp = open('../tools/test.html', 'r', encoding='utf-8')
    soup = BeautifulSoup(fp, 'lxml')
    # tag location
    # soup.tagName: returns the first occurrence of that tag
    # print(soup.div)
    # attribute location: find(tagName, attrName='attrValue')
    # print(soup.find('div', class_='song'))
    # print(soup.find_all('div', class_='song'))
    # CSS-selector location
    # print(soup.select('#feng'))
    # hierarchy selectors: > is one level, a space spans any number of levels
    # print(soup.select('.tang li > a'))

    # extracting text
    # a_tag = soup.select('#feng')[0]
    # print(a_tag.string)  # direct text only

    # div_tag = soup.select('.song')[0]
    # print(div_tag.text)  # all nested text
    # extracting attributes
    a_tag = soup.select('#feng')[0]
    print(a_tag)
    print(a_tag['href'])
    fp.close()
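
Since test.html isn't bundled with these notes, here is a self-contained version of the same calls against an inline snippet that mimics its structure (.song, .tang and #feng are reproduced purely for illustration):

    from bs4 import BeautifulSoup

    html = '''
    <div class="song"><p>first song</p></div>
    <div class="tang">
        <ul>
            <li><a id="feng" href="http://example.com">a poem</a></li>
        </ul>
    </div>
    '''
    soup = BeautifulSoup(html, 'lxml')

    print(soup.div)                          # first <div> in the document
    print(soup.find('div', class_='song'))   # attribute-based lookup
    print(soup.select('.tang li > a'))       # CSS hierarchy selector
    a_tag = soup.select('#feng')[0]
    print(a_tag.string)                      # direct text: 'a poem'
    print(a_tag['href'])                     # attribute: 'http://example.com'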

9: Scraping a novel
    import requests
    from bs4 import BeautifulSoup
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
    }

    url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    page_text = response.text
    fp = open('./sanguo.txt', 'w', encoding='utf-8')
    # parse out each chapter title plus its detail-page url
    soup = BeautifulSoup(page_text, 'lxml')
    a_list = soup.select('.book-mulu > ul > li > a')
    for a in a_list:
        title = a.string
        detail_url = 'https://www.shicimingju.com' + a['href']
        response = requests.get(detail_url, headers=headers)
        response.encoding = 'utf-8'
        page_text_detail = response.text
        soup = BeautifulSoup(page_text_detail, 'lxml')
        div_tag = soup.find('div', class_='chapter_content')
        content = div_tag.text
        fp.write(title + ':' + content + '\n')
        print(title, 'saved!')
    fp.close()
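
Fetching a few hundred chapter pages back to back is a quick way to get blocked; a polite, hedged tweak is to sleep briefly between detail requests:

    import time

    # add inside the chapter loop, right after each detail-page request:
    time.sleep(1)  # ~1s pause between chapters to go easy on the server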

10: Using xpath
    from lxml import etree
    tree = etree.parse('../tools/test.html')
    # tag location
    # a leading /  : locate level by level starting from the root
    # a leading // : locate the tag from anywhere in the document

    # tag = tree.xpath('/html/head/title')
    # tag = tree.xpath('/html//title')
    # tag = tree.xpath('//title')
    # attribute location
    # tag = tree.xpath('//div[@class="tang"]')
    # print(tag)
    # index location: xpath indexing starts at 1
    # tag = tree.xpath('//div[@class="tang"]/ul/li[3]')
    # print(tag)

    # extracting text: /text()  //text()
    # div_tag = tree.xpath('//div[@class="tang"]//text()')
    # print(''.join(div_tag))
    # extracting attributes: /@attrName
    # a = tree.xpath('//a[@id="feng"]/@href')[0]
    # print(a)
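
As in the bs4 section, test.html isn't included, so here is a self-contained sketch that runs the same kinds of xpath calls against an inline snippet (same illustrative structure as before):

    from lxml import etree

    html = '''
    <html><body>
    <div class="tang">
        <ul>
            <li><a id="feng" href="http://example.com">a poem</a></li>
            <li><a href="#">another</a></li>
        </ul>
    </div>
    </body></html>
    '''
    tree = etree.HTML(html)  # parse from a string instead of a file

    print(tree.xpath('//div[@class="tang"]/ul/li[1]/a/text()'))  # ['a poem']
    print(tree.xpath('//a[@id="feng"]/@href')[0])                # http://example.com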

11: The ultimate image crawl

    # scrape every image on a single page

    # import requests
    # from lxml import etree
    # import os
    #
    # dirName = 'imgLibs'
    # if not os.path.exists(dirName):
    #     os.mkdir(dirName)
    #
    # headers = {
    #     'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
    # }
    #
    # url = 'https://pic.netbian.com/4kmeinv/'
    # response = requests.get(url, headers=headers)
    # response.encoding = 'gbk'
    #
    # page_text = response.text
    #
    # # parse out the image URL + image name
    # tree = etree.HTML(page_text)  # load the data to parse into the tree
    #
    # # in devtools, click one image, walk up to its parent tag, right-click => Copy XPath;
    # # since that copies the parent's path, append /li at the end
    # li_list = tree.xpath('//*[@id="main"]/div[3]/ul/li')
    #
    # # pull the name and image link out of each li
    #
    # for li in li_list:  # per-item (local) parsing
    #     title = li.xpath('./a/b/text()')[0] + '.jpg'  # . = search below this node; the image name
    #     src = 'https://pic.netbian.com' + li.xpath('./a/img/@src')[0]  # the image path
    #     img_data = requests.get(src, headers=headers).content
    #     filePath = dirName + '/' + title
    #     with open(filePath, 'wb') as fp:
    #         fp.write(img_data)
    #     print(title, 'downloaded')



    # scrape images across multiple pages

    # import requests
    # from lxml import etree
    # import os
    # dirName = 'imgLibs'
    # if not os.path.exists(dirName):
    #     os.mkdir(dirName)
    #
    # headers = {
    #     'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
    # }
    #
    # url = 'https://pic.netbian.com/4kmeinv/index_%d.html'
    # for page in range(1, 175):
    #     if page == 1:
    #         new_url = 'https://pic.netbian.com/4kmeinv/'  # page 1 has no index suffix
    #     else:
    #         new_url = url % page
    #
    #     response = requests.get(new_url, headers=headers)
    #     response.encoding = 'gbk'
    #     page_text = response.text
    #     # parse out the image URL + image name
    #     tree = etree.HTML(page_text)
    #     li_list = tree.xpath('//*[@id="main"]/div[3]/ul/li')
    #     for li in li_list:  # per-item (local) parsing
    #         title = li.xpath('./a/b/text()')[0] + '.jpg'  # . = the node this xpath call starts from
    #         src = 'https://pic.netbian.com' + li.xpath('./a/img/@src')[0]
    #         img_data = requests.get(src, headers=headers).content
    #         filePath = dirName + '/' + title
    #         with open(filePath, 'wb') as fp:
    #             fp.write(img_data)
    #         print(title, 'downloaded!')
