Requests-HTML

📁 <Category 8> 📅 2026-04-04 13:39:51 👁️ 1 次阅读

📖 正文内容

GET请求

from requests_html import HTMLSession
session = HTMLSession()
url = 'http://news.youth.cn/'
response = session.get(url)
response.encoding='gb2312'
print(response.text)

POST请求

from requests_html import HTMLSession
session = HTMLSession()
url = 'http://httpbin.org/post'
data = {'user':'admin','password':123456}
response = session.post(url=url,data=data)
if response.status_code == 200:
    print(response.text)

修改请求头信息

from requests_html import HTMLSession
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
}
url = 'http://httpbin.org/post'
data = {'user':'admin','password':123456}
session = HTMLSession()
response = session.post(url=url,data=data,headers=headers)
print(response.text)

生成随机的请求头信息

from requests_html import HTMLSession,UserAgent
session = HTMLSession()
user_agent = UserAgent().random
url = 'http://httpbin.org/get'
response = session.get(url,headers={'User-Agent':user_agent})
if response.status_code == 200:
    print(response.text)

数据的提取

以往使用requests模块实现爬虫程序时，还需要为其配置一个解析HTML代码的搭档。Request-HTML模块对此进行了一个比较大的升级，不仅支持CSS选择器，还支持XPath的节点提取方式。

CSS 选择器

CSS选择器中需要使用HTML的find()方法

find(selector:str='*',containing:_Contraining=None,clean:bool=False,first:bool=False,_ending:str=None)
参数：
selector: 使用CSS选择器定位网页元素
containing: 通过指定文本获取网页元素
clean: 是否清楚HTML中的<script>和<style>标签，默认为False不清除
first: 是否只返回网页中第一个元素，默认为False,表示全部返回来
_encoding: 表示编码格式

XPath选择器

XPath选择器同样需要使用HTML进行调用。

xpath(selector:str,clean:bool=False,first:bool=False,_encoding:str=None)
参数：
selector:使用XPath选择器定位网页元素
clean: 是否清楚HTML中的<script>和<style>标签，默认为False不清除
first:是否只返回网页中第一个元素，默认为False,表示全部返回来
_encoding: 表示编码格式

爬取及时新闻

from requests_html import HTMLSession,UserAgent
session = HTMLSession()
headers = {
    'User-Agent':UserAgent().random
}
url = 'http://news.youth.cn/jsxw/index.htm'
response = session.get(url=url,headers=headers)
response.encoding = 'gb2312'
if response.status_code == 200:
    html = response.html
    all_li = html.xpath('//ul[@class="tj3_1"]/li')
    for li in all_li:
        news_name = li.find('a')[0].text
        print('news_name:',news_name)
        # 获取新闻详细对应地址
        news_href = 'http://news.youth.cn/jsxw' +  li.find('a')[0].attrs.get('href').lstrip('.')
        print('新闻链接:',news_href)
        news_time = li.find('font')[0].text
        print('新闻发布时间:',news_time)

find()方法中containing参数

如果需要获取<li>标签中指定的新闻内容时，可以使用find()方法中的containing参数,以获得"新馆疫情"相关新闻内容为例

from requests_html import HTMLSession,UserAgent
session = HTMLSession()
ua = UserAgent().random
url = 'http://news.youth.cn/jsxw/index.htm'
response = session.get(url=url,headers={'user-agent':ua})
response.encoding = 'gb2312'
if response.status_code == 200:
    for li in response.html.find('li',containing='新冠'):
        news_title = li.find('a')[0].text # 提取新闻标题内容
        print(news_title)
        news_href = 'http://news.youth.cn/jsxw' + li.find('a[href]')[0].attrs.get('href').lstrip('.')
        print('新闻url地址为:',news_href)
        news_time = li.find('font')[0].text
        print(news_time)

search()方法与search_all()方法

通过关键字提取相应的数据信息，其中,search()方法表示查找符合条件的第一个元素，而search_all()方法则表示符合条件的所有元素。

from requests_html import HTMLSession,UserAgent
session = HTMLSession()
headers = {
    'User-Agent':UserAgent().random
}
url = 'http://news.youth.cn/jsxw/index.htm'
response = session.get(url=url,headers=headers)
if response.status_code == 200:
    html = response.html
    for li in html.find('li',containing='新冠'):
        a = li.search('<a href="{}">{}</a>')
        news_title = a[1]
        news_href = 'http://news.youth.cn/jsxw' + a[0].lstrip('.')
        news_time = li.search('<font>{}</font>')[0]
        print('新闻标题:',news_title,',新闻链接:',news_href,',日期:',news_time)

获取动态加载数据

在爬取网页数据时，经常会遇到直接对网页地址发送请求，可返回的HTML代码中并没有所需要的的数据的情况，遇到这种情况，多数都是因为网页数据使用了Ajax请求并由JavaScript渲染到网页当中。针对该问题，Requests-HTML模块提供了render()方法，第一次调用该方法将会自动下载Chromium浏览器，然后通过该浏览器直接加载JavaScript渲染后的信息

from requests_html import HTMLSession,UserAgent # 导入HTMLSession类
session = HTMLSession()          # 创建HTML会话对象
ua = UserAgent().random          # 创建随机请求头
# 发送网路请求
r = session.get('https://movie.douban.com/tag/#/?sort=U&range=0,10'
                '&tags=%E7%94%B5%E5%BD%B1,2020',headers = {'user-agent': ua})
r.encoding='gb2312'              # 编码
if r.status_code == 200:         # 判断请求是否成功
    r.html.render() # 调用render()方法，没有Chromium浏览器就自动下载
    # 获取当前页面中所有电影信息的a标签
    class_wp = r.html.xpath('.//div[@class="list-wp"]/a')  
    for a in class_wp:
        title = a.find('p span')[0].text  # 获取电影名称
        rate = a.find('p span')[1].text  # 获取电影评分
        details_url = a.attrs.get('href')  # 获取详情页url地址
        img_url = a.find('img')[0].attrs.get('src')  # 获取图片url地址
        print('电影名称为：', title)  # 打印电影名称
        print('电影评分为：', rate)  # 打印电影评分
        print('详情页地址为：', details_url)  # 打印电影详情页url地址
        print('图片地址为：', img_url)  # 打印电影图片地址

from requests_html import HTMLSession,UserAgent
session = HTMLSession()
headers = {'User-Agent':UserAgent().random}
url = 'https://movie.douban.com/cinema/nowplaying/hangzhou/'
response = session.get(url=url,headers=headers)
if response.status_code == 200:
    response.encoding = 'utf-8'
    response.html.render()
    html = response.html
    # 获取当前页面中所有电影信息的a标签
    all_li = html.xpath('//ul[@class="lists"]/li')
    for li in all_li:
        child_all_li = li.find('ul li') # 查找ul下所有的li
        img_li = child_all_li[0]
        movie_href = img_li.find('a')[0].attrs.get('href')
        movie_img = img_li.find('img')[0].attrs.get('src')
        movie_title_li = child_all_li[1]
        movie_title = movie_title_li.find('a')[0].text
        print({'movie_title':movie_title,'movie_href':movie_href,"movie_img":movie_img}

💡 示例代码

<div style='--en-codeblock:true;--en-blockId:37X-S6tVAA1;--en-meta:{"title":"","lang":"Python","theme":"default","showLine":true,"lineWrap":false};box-sizing: border-box; padding: 8px; font-family: Monaco, Menlo, Consolas, "Courier New", monospace; font-size: 12px; color: rgb(51, 51, 51); border-top-left-radius: 4px; border-top-right-radius: 4px; border-bottom-right-radius: 4px; border-bottom-left-radius: 4px; background-color: rgb(251, 250, 248); border: 1px solid rgba(0, 0, 0, 0.14902); background-position: initial initial; background-repeat: initial initial; margin-top: 6px;'>from requests_html import HTMLSession
session = HTMLSession()
url = 'http://news.youth.cn/'
response = session.get(url)
response.encoding='gb2312'
print(response.text)</div>

<div style='--en-codeblock:true;--en-blockId:hphTQ6Bus2g;--en-meta:{"title":"","lang":"Python","theme":"default","showLine":true,"lineWrap":false};box-sizing: border-box; padding: 8px; font-family: Monaco, Menlo, Consolas, "Courier New", monospace; font-size: 12px; color: rgb(51, 51, 51); border-top-left-radius: 4px; border-top-right-radius: 4px; border-bottom-right-radius: 4px; border-bottom-left-radius: 4px; background-color: rgb(251, 250, 248); border: 1px solid rgba(0, 0, 0, 0.14902); background-position: initial initial; background-repeat: initial initial; margin-top: 6px;'>from requests_html import HTMLSession
session = HTMLSession()
url = 'http://httpbin.org/post'
data = {'user':'admin','password':123456}
response = session.post(url=url,data=data)
if response.status_code == 200:
    print(response.text)</div>

<div style='--en-codeblock:true;--en-blockId:8ypHSqIlA3s;--en-meta:{"title":"","lang":"Python","theme":"default","showLine":true,"lineWrap":false};box-sizing: border-box; padding: 8px; font-family: Monaco, Menlo, Consolas, "Courier New", monospace; font-size: 12px; color: rgb(51, 51, 51); border-top-left-radius: 4px; border-top-right-radius: 4px; border-bottom-right-radius: 4px; border-bottom-left-radius: 4px; background-color: rgb(251, 250, 248); border: 1px solid rgba(0, 0, 0, 0.14902); background-position: initial initial; background-repeat: initial initial; margin-top: 6px;'>from requests_html import HTMLSession
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
}
url = 'http://httpbin.org/post'
data = {'user':'admin','password':123456}
session = HTMLSession()
response = session.post(url=url,data=data,headers=headers)
print(response.text)</div>

<div style='--en-codeblock:true;--en-blockId:HtTXT6Tl4c5;--en-meta:{"title":"","lang":"Python","theme":"default","showLine":true,"lineWrap":false};box-sizing: border-box; padding: 8px; font-family: Monaco, Menlo, Consolas, "Courier New", monospace; font-size: 12px; color: rgb(51, 51, 51); border-top-left-radius: 4px; border-top-right-radius: 4px; border-bottom-right-radius: 4px; border-bottom-left-radius: 4px; background-color: rgb(251, 250, 248); border: 1px solid rgba(0, 0, 0, 0.14902); background-position: initial initial; background-repeat: initial initial; margin-top: 6px;'>from requests_html import HTMLSession,UserAgent
session = HTMLSession()
user_agent = UserAgent().random
url = 'http://httpbin.org/get'
response = session.get(url,headers={'User-Agent':user_agent})
if response.status_code == 200:
    print(response.text)</div>

<div style='--en-codeblock:true;--en-blockId:VM6yRCHtflz;--en-meta:{"title":"","lang":"Python","theme":"default","showLine":true,"lineWrap":false};box-sizing: border-box; padding: 8px; font-family: Monaco, Menlo, Consolas, "Courier New", monospace; font-size: 12px; color: rgb(51, 51, 51); border-top-left-radius: 4px; border-top-right-radius: 4px; border-bottom-right-radius: 4px; border-bottom-left-radius: 4px; background-color: rgb(251, 250, 248); border: 1px solid rgba(0, 0, 0, 0.14902); background-position: initial initial; background-repeat: initial initial; margin-top: 6px;'>find(selector:str='*',containing:_Contraining=None,clean:bool=False,first:bool=False,_ending:str=None)
参数：
selector: 使用CSS选择器定位网页元素
containing: 通过指定文本获取网页元素
clean: 是否清楚HTML中的&lt;script&gt;和&lt;style&gt;标签，默认为False不清除
first: 是否只返回网页中第一个元素，默认为False,表示全部返回来
_encoding: 表示编码格式</div>

<div style='--en-codeblock:true;--en-blockId:WNJGQWZ8Ng3;--en-meta:{"title":"","lang":"Python","theme":"default","showLine":true,"lineWrap":false};box-sizing: border-box; padding: 8px; font-family: Monaco, Menlo, Consolas, "Courier New", monospace; font-size: 12px; color: rgb(51, 51, 51); border-top-left-radius: 4px; border-top-right-radius: 4px; border-bottom-right-radius: 4px; border-bottom-left-radius: 4px; background-color: rgb(251, 250, 248); border: 1px solid rgba(0, 0, 0, 0.14902); background-position: initial initial; background-repeat: initial initial; margin-top: 6px;'>xpath(selector:str,clean:bool=False,first:bool=False,_encoding:str=None)
参数：
selector:使用XPath选择器定位网页元素
clean: 是否清楚HTML中的&lt;script&gt;和&lt;style&gt;标签，默认为False不清除
first:是否只返回网页中第一个元素，默认为False,表示全部返回来
_encoding: 表示编码格式</div>

<div style='--en-codeblock:true;--en-blockId:519URuzX3mq;--en-meta:{"title":"","lang":"Python","theme":"default","showLine":true,"lineWrap":false};box-sizing: border-box; padding: 8px; font-family: Monaco, Menlo, Consolas, "Courier New", monospace; font-size: 12px; color: rgb(51, 51, 51); border-top-left-radius: 4px; border-top-right-radius: 4px; border-bottom-right-radius: 4px; border-bottom-left-radius: 4px; background-color: rgb(251, 250, 248); border: 1px solid rgba(0, 0, 0, 0.14902); background-position: initial initial; background-repeat: initial initial; margin-top: 6px;'>from requests_html import HTMLSession,UserAgent
session = HTMLSession()
headers = {
    'User-Agent':UserAgent().random
}
url = 'http://news.youth.cn/jsxw/index.htm'
response = session.get(url=url,headers=headers)
response.encoding = 'gb2312'
if response.status_code == 200:
    html = response.html
    all_li = html.xpath('//ul[@class="tj3_1"]/li')
    for li in all_li:
        news_name = li.find('a')[0].text
        print('news_name:',news_name)
        # 获取新闻详细对应地址
        news_href = 'http://news.youth.cn/jsxw' +  li.find('a')[0].attrs.get('href').lstrip('.')
        print('新闻链接:',news_href)
        news_time = li.find('font')[0].text
        print('新闻发布时间:',news_time)</div>

<div style='--en-codeblock:true;--en-blockId:netQS2CK7pM;--en-meta:{"title":"","lang":"Python","theme":"default","showLine":true,"lineWrap":false};box-sizing: border-box; padding: 8px; font-family: Monaco, Menlo, Consolas, "Courier New", monospace; font-size: 12px; color: rgb(51, 51, 51); border-top-left-radius: 4px; border-top-right-radius: 4px; border-bottom-right-radius: 4px; border-bottom-left-radius: 4px; background-color: rgb(251, 250, 248); border: 1px solid rgba(0, 0, 0, 0.14902); background-position: initial initial; background-repeat: initial initial; margin-top: 6px;'>from requests_html import HTMLSession,UserAgent
session = HTMLSession()
ua = UserAgent().random
url = 'http://news.youth.cn/jsxw/index.htm'
response = session.get(url=url,headers={'user-agent':ua})
response.encoding = 'gb2312'
if response.status_code == 200:
    for li in response.html.find('li',containing='新冠'):
        news_title = li.find('a')[0].text # 提取新闻标题内容
        print(news_title)
        news_href = 'http://news.youth.cn/jsxw' + li.find('a[href]')[0].attrs.get('href').lstrip('.')
        print('新闻url地址为:',news_href)
        news_time = li.find('font')[0].text
        print(news_time)</div>

<div style='--en-codeblock:true;--en-blockId:igf2Q-nR7_W;--en-meta:{"title":"","lang":"Python","theme":"default","showLine":true,"lineWrap":false};box-sizing: border-box; padding: 8px; font-family: Monaco, Menlo, Consolas, "Courier New", monospace; font-size: 12px; color: rgb(51, 51, 51); border-top-left-radius: 4px; border-top-right-radius: 4px; border-bottom-right-radius: 4px; border-bottom-left-radius: 4px; background-color: rgb(251, 250, 248); border: 1px solid rgba(0, 0, 0, 0.14902); background-position: initial initial; background-repeat: initial initial; margin-top: 6px;'>from requests_html import HTMLSession,UserAgent
session = HTMLSession()
headers = {
    'User-Agent':UserAgent().random
}
url = 'http://news.youth.cn/jsxw/index.htm'
response = session.get(url=url,headers=headers)
if response.status_code == 200:
    html = response.html
    for li in html.find('li',containing='新冠'):
        a = li.search('&lt;a href="{}"&gt;{}&lt;/a&gt;')
        news_title = a[1]
        news_href = 'http://news.youth.cn/jsxw' + a[0].lstrip('.')
        news_time = li.search('&lt;font&gt;{}&lt;/font&gt;')[0]
        print('新闻标题:',news_title,',新闻链接:',news_href,',日期:',news_time)</div>

<div style='--en-codeblock:true;--en-blockId:te6ISq7k78v;--en-meta:{"title":"","lang":"Python","theme":"default","showLine":true,"lineWrap":false};box-sizing: border-box; padding: 8px; font-family: Monaco, Menlo, Consolas, "Courier New", monospace; font-size: 12px; color: rgb(51, 51, 51); border-top-left-radius: 4px; border-top-right-radius: 4px; border-bottom-right-radius: 4px; border-bottom-left-radius: 4px; background-color: rgb(251, 250, 248); border: 1px solid rgba(0, 0, 0, 0.14902); background-position: initial initial; background-repeat: initial initial; margin-top: 6px;'>from requests_html import HTMLSession,UserAgent # 导入HTMLSession类
session = HTMLSession()          # 创建HTML会话对象
ua = UserAgent().random          # 创建随机请求头
# 发送网路请求
r = session.get('https://movie.douban.com/tag/#/?sort=U&amp;range=0,10'
                '&amp;tags=%E7%94%B5%E5%BD%B1,2020',headers = {'user-agent': ua})
r.encoding='gb2312'              # 编码
if r.status_code == 200:         # 判断请求是否成功
    r.html.render() # 调用render()方法，没有Chromium浏览器就自动下载
    # 获取当前页面中所有电影信息的a标签
    class_wp = r.html.xpath('.//div[@class="list-wp"]/a')  
    for a in class_wp:
        title = a.find('p span')[0].text  # 获取电影名称
        rate = a.find('p span')[1].text  # 获取电影评分
        details_url = a.attrs.get('href')  # 获取详情页url地址
        img_url = a.find('img')[0].attrs.get('src')  # 获取图片url地址
        print('电影名称为：', title)  # 打印电影名称
        print('电影评分为：', rate)  # 打印电影评分
        print('详情页地址为：', details_url)  # 打印电影详情页url地址
        print('图片地址为：', img_url)  # 打印电影图片地址
</div>

<div style='--en-codeblock:true;--en-blockId:jIBUT-mbE1e;--en-meta:{"title":"","lang":"Python","theme":"default","showLine":true,"lineWrap":false};box-sizing: border-box; padding: 8px; font-family: Monaco, Menlo, Consolas, "Courier New", monospace; font-size: 12px; color: rgb(51, 51, 51); border-top-left-radius: 4px; border-top-right-radius: 4px; border-bottom-right-radius: 4px; border-bottom-left-radius: 4px; background-color: rgb(251, 250, 248); border: 1px solid rgba(0, 0, 0, 0.14902); background-position: initial initial; background-repeat: initial initial; margin-top: 6px;'>from requests_html import HTMLSession,UserAgent
session = HTMLSession()
headers = {'User-Agent':UserAgent().random}
url = 'https://movie.douban.com/cinema/nowplaying/hangzhou/'
response = session.get(url=url,headers=headers)
if response.status_code == 200:
    response.encoding = 'utf-8'
    response.html.render()
    html = response.html
    # 获取当前页面中所有电影信息的a标签
    all_li = html.xpath('//ul[@class="lists"]/li')
    for li in all_li:
        child_all_li = li.find('ul li') # 查找ul下所有的li
        img_li = child_all_li[0]
        movie_href = img_li.find('a')[0].attrs.get('href')
        movie_img = img_li.find('img')[0].attrs.get('src')
        movie_title_li = child_all_li[1]
        movie_title = movie_title_li.find('a')[0].text
        print({'movie_title':movie_title,'movie_href':movie_href,"movie_img":movie_img}</div>

💭 技巧提示

💡

Requests-HTML

📖 正文内容

GET请求

POST请求

修改请求头信息

生成随机的请求头信息

数据的提取

CSS 选择器

XPath选择器

爬取及时新闻

find()方法中containing参数

search()方法与search_all()方法

获取动态加载数据

💡 示例代码

💭 技巧提示

📚 参考资料

💬 评论交流

📖 正文内容

GET请求

POST请求

修改请求头信息

生成随机的请求头信息

数据的提取

CSS 选择器

XPath选择器

爬取及时新闻

find()方法中containing参数

search()方法与search_all()方法

获取动态加载数据

💡 示例代码

💭 技巧提示

📚 参考资料

🔗 相关文章

💬 评论交流