一:requests 模块的基本使用
# 1. Basic usage of requests: GET a page and persist the HTML to disk.
import requests

# Target URL
url = 'https://www.sogou.com/'
# requests.get returns a Response object
response = requests.get(url=url)
# Response body decoded as text
page_text = response.text
# Persist to disk (the original paste lost the indentation of the `with` body,
# which made this a SyntaxError — restored here)
with open('./sougou.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
print('生而无畏,战至终章')
二:制作一个简易的网页存储系统
# 2. Goal: a simple page collector whose query and output file are dynamic.
import requests

key = input('请输入想要查询的内容:')
# 1. Make the query parameter dynamic — requests appends it to the URL
params = {
    'query': key
}
# UA spoofing so the request looks like a normal browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
}
# Base URL: only the part before the '?'
url = 'https://www.sogou.com/web'
response = requests.get(url=url, params=params, headers=headers)
page_text = response.text
# Derive the output file name from the query so each search is stored separately
fileName = key + '.html'
with open(fileName, 'w', encoding='utf-8') as fp:
    fp.write(page_text)
print('生而无畏,战至终章')
三:动态数据加载相关
# 3. Dynamically loaded data: scrape Douban's movie chart JSON endpoint.
import requests

# UA spoofing
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
}
url = 'https://movie.douban.com/j/chart/top_list'
# Query parameters copied from the XHR request seen in the browser's devtools
params = {
    'type': '11',
    'interval_id': '100:90',
    'action': '',
    'start': '0',   # offset of the first record
    'limit': '1',   # number of records to return
}
# .json() deserializes the JSON response body straight into Python objects
json_data = requests.get(url=url, headers=headers, params=params).json()
print(json_data)
四:post请求操作
# 4. POST request: scrape KFC store addresses (list is loaded via AJAX POST).
import requests

# UA spoofing
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
}
url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'

# Fetch pages 1..19 (the original comment said "five pages" but the loop
# actually covers 19 — the loop bound is kept, the comment is corrected).
for pageNum in range(1, 20):
    # Form payload; passing `data=` sends it urlencoded
    data = {
        'cname': '',
        'pid': '',
        'keyword': '上海',
        'pageIndex': pageNum,
        'pageSize': '10',
    }
    json_data = requests.post(url=url, headers=headers, data=data).json()['Table1']
    for dic in json_data:
        print(dic['addressDetail'])
    # Progress marker. The original kept a separate counter `a` that always
    # equaled pageNum, so the redundant counter is dropped.
    print(pageNum)
五:批量获取药监总局的数据
# 5. Batch-fetch company data from the NMPA (drug administration) portal.
import requests

# UA spoofing
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
}
# Approach (the original kept this as a bare triple-quoted string; it has no
# runtime effect, so it is turned into real comments):
# 1. Fetch one company's detail data.
#    - Is the detail data dynamically loaded? A local search in the
#      packet-capture tool shows it is.
#    - Use a global search in the capture tool to locate the packet that
#      carries the dynamically loaded data.
# 2. Work out how to obtain every company's ID.

ids = []
# Step 1: batch-collect company IDs from the paginated list endpoint
main_url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsList'
for pageNum in range(1, 6):
    data = {
        'on': 'true',
        'page': str(pageNum),
        'pageSize': '15',
        'productName': '',
        'conditionType': '1',
        'applyname': '',
        'applysn': '',
    }
    data_list = requests.post(main_url, headers=headers, data=data).json()['list']
    for dic in data_list:
        _id = dic['ID']
        ids.append(_id)

# Step 2: fetch each company's detail record by its ID
url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById'
for _id in ids:
    data = {
        'id': _id
    }
    json_data = requests.post(url=url, headers=headers, data=data).json()
    print(json_data['epsName'])
六:爬取荣耀的线下店铺
# 6. Scrape Honor's offline store list from the vmall open API.
import requests

# UA spoofing
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
}
# Batch-fetch shop info for one province/city page
main_url = 'https://openapi.vmall.com/mcp/offlineshop/getShopList'
# NOTE: `json=` sends this dict as a JSON request body
# (Content-Type: application/json), unlike `data=`, which urlencodes it.
data = {"portal":2,"lang":"zh-CN","country":"CN","brand":1,"province":"北京","city":"北京","pageNo":1,"pageSize":20}
json_data = requests.post(main_url, headers=headers, json=data).json()['shopInfos']
# Detail endpoint — defined but never used in this snippet; the tutorial stops
# before iterating json_data and fetching each shop by id. TODO(review): either
# use it or drop it.
url = 'https://openapi.vmall.com/mcp/offlineshop/getShopById'
七:图片的简单爬取
# 7. Simple image download — two approaches.
import requests

# UA spoofing (only used by the requests approach below)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
}
# Approach A (requests): fetch raw bytes via .content, write in binary mode.
# img_src = 'https://i03piccdn.sogoucdn.com/e931046dcf5606d8'
# img_data = requests.get(img_src, headers=headers).content
# with open('./123.jpg', 'wb') as fp:
#     fp.write(img_data)

# Approach B (urllib): urlretrieve downloads straight to a file.
# NOTE: this path does NOT use the spoofed headers above — urllib sends its
# own default User-Agent.
from urllib import request
img_src = 'https://i03piccdn.sogoucdn.com/e931046dcf5606d8'
request.urlretrieve(img_src, './456.jpg')
八:bs4的简单使用
# 8. Basics of BeautifulSoup (bs4), parsing a local test HTML file.
# Note: this section has no novel scraping — that is section 9.
from bs4 import BeautifulSoup

# Use a context manager so the handle is closed; the original opened the file
# and never closed it (resource leak).
with open('../tools/test.html', 'r', encoding='utf-8') as fp:
    soup = BeautifulSoup(fp, 'lxml')

# Tag location
# soup.tagName: returns the FIRST occurrence of that tag
# print(soup.div)
# Attribute location: find(tagName, attrName='attrValue')
# print(soup.find('div', class_='song'))
# print(soup.findAll('div', class_='song'))
# CSS-selector location
# print(soup.select('#feng'))
# Hierarchy selectors: '>' is exactly one level, a space spans multiple levels
# print(soup.select('.tang li > a'))
# Text extraction
# a_tag = soup.select('#feng')[0]
# print(a_tag.string)  # direct (immediate) text only
# div_tag = soup.select('.song')[0]
# print(div_tag.text)  # all nested text
# Attribute extraction: index the tag like a dict
a_tag = soup.select('#feng')[0]
print(a_tag)
print(a_tag['href'])
九:爬取小说
# 9. Scrape a novel (Romance of the Three Kingdoms): chapter list page first,
# then each chapter's detail page; append every chapter to one text file.
import requests
from bs4 import BeautifulSoup

headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
}
url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
response = requests.get(url, headers=headers)
# Force the encoding: the site serves UTF-8 but requests may guess wrong
response.encoding = 'utf-8'
page_text = response.text
# Parse out chapter title + detail-page URL from the table of contents
soup = BeautifulSoup(page_text, 'lxml')
a_list = soup.select('.book-mulu > ul > li > a')
# The original opened the file and never closed it; `with` fixes the leak.
with open('./sanguo.txt', 'w', encoding='utf-8') as fp:
    for a in a_list:
        title = a.string
        detail_url = 'https://www.shicimingju.com' + a['href']
        response = requests.get(detail_url, headers=headers)
        response.encoding = 'utf-8'
        page_text_detail = response.text
        # Fresh soup for the detail page; chapter body lives in this div
        soup = BeautifulSoup(page_text_detail, 'lxml')
        div_tag = soup.find('div', class_='chapter_content')
        content = div_tag.text
        fp.write(title + ':' + content + '\n')
        print(title, '保存成功!')
十:xpath的使用
# 10. xpath basics with lxml, demonstrated on a local test HTML file.
from lxml import etree
tree = etree.parse('../tools/test.html')
# Tag location
# Leading '/': must walk down from the root node level by level
# Leading '//': locate the tag anywhere in the document
# tag = tree.xpath('/html/head/title')
# tag = tree.xpath('/html//title')
# tag = tree.xpath('//title')
# Attribute location
# tag = tree.xpath('//div[@class="tang"]')
# print(tag)
# Index location — NOTE: xpath indices start at 1, not 0
# tag = tree.xpath('//div[@class="tang"]/ul/li[3]')
# print(tag)
# Text extraction: /text() (direct) vs //text() (all nested)
# div_tag = tree.xpath('//div[@class="tang"]//text()')
# print(''.join(div_tag))
# Attribute extraction: /@attrName
# a = tree.xpath('//a[@id="feng"]/@href')[0]
# print(a)
十一:图片数据的终极爬取
# Scrape every image on a single page
# import requests
# from lxml import etree
# import os
#
# dirName = 'imgLibs'
# if not os.path.exists(dirName):
#     os.mkdir(dirName)
#
# headers = {
#     'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
# }
#
# url = 'https://pic.netbian.com/4kmeinv/'
# response = requests.get(url,headers=headers)
# response.encoding = 'gbk'
#
# page_text = response.text
#
# # Parse out each image's address + name
# tree = etree.HTML(page_text) # load the data to be parsed into the tree
#
# li_list= tree.xpath('//*[@id="main"]/div[3]/ul/li') # in devtools, click one image, find its parent tag, right-click -> Copy -> Copy XPath; since that copies the parent level, append /li so every list item matches
#
# # Pull the name and the image link out of each entry in li_list
#
# for li in li_list: # local (partial) parsing relative to each li
#     title =li.xpath('./a/b/text()')[0]+'.jpg' # '.' means search below the calling tag — extracts the image name
#     src = 'https://pic.netbian.com'+li.xpath('./a/img/@src')[0] # extracts the image path
#     img_data = requests.get(src,headers=headers).content
#     filePath = dirName+'/'+title
#     with open(filePath,'wb') as fp:
#         fp.write(img_data)
#     print(title,'下载成功')
# Scrape images across multiple pages
# import requests
# from lxml import etree
# import os
# dirName = 'imgLibs'
# if not os.path.exists(dirName):
#     os.mkdir(dirName)
#
# headers = {
#     'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36',
# }
#
# url = 'https://pic.netbian.com/4kmeinv/index_%d.html'
# for page in range(1,175):
#     if page == 1:
#         new_url = 'https://pic.netbian.com/4kmeinv/'  # page 1 has no index_N suffix
#     else:
#         new_url = format(url%page)
#
#     response = requests.get(new_url,headers=headers)
#     response.encoding = 'gbk'
#     page_text = response.text
#     # Parse out each image's address + name
#     tree = etree.HTML(page_text)
#     li_list = tree.xpath('//*[@id="main"]/div[3]/ul/li')
#     for li in li_list: # local parsing: '.' makes the xpath relative to the caller's tag
#         title = li.xpath('./a/b/text()')[0]+'.jpg'
#         src = 'https://pic.netbian.com'+li.xpath('./a/img/@src')[0]
#         img_data = requests.get(src,headers=headers).content
#         filePath = dirName+'/'+title
#         with open(filePath,'wb') as fp:
#             fp.write(img_data)
#         print(title,'下载成功!')
© 著作权归作者所有
相关热门文章
发表评论