Python异步爬取知乎热榜实例分享_Python

Python异步爬取知乎热榜实例分享

2022-11-27 12:10程序员班长 Python

这篇文章主要介绍了Python异步爬取知乎热榜实例分享，文章围绕Python异步爬取的相关资料展开对知乎热榜爬取的相关内容，需要的小伙伴可以参考一下

一、错误代码：摘要和详细的url获取不到

				?

									import asyncio

									from bs4 import BeautifulSoup

									import aiohttp

									# HTTP request headers handed to aiohttp.ClientSession inside getPages().
									headers = {
									    'user-agent': (
									        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
									        'AppleWebKit/537.36 (KHTML, like Gecko) '
									        'Chrome/86.0.4240.198 Safari/537.36'
									    ),
									    'referer': (
									        'https://www.baidu.com/s'
									        '?tn=02003390_43_hao_pg&isource=infinity&iname=baidu&itype=web&ie=utf-8'
									        '&wd=%E7%9F%A5%E4%B9%8E%E7%83%AD%E6%A6%9C'
									    ),
									}

									async def getPages(url):
									    """Download *url* and print title, abstract and image of each hot-list item.

									    The page is requested with the module-level ``headers`` and parsed
									    with BeautifulSoup using the ``lxml`` parser. Every element matching
									    ``.HotList-item`` produces one three-line printout.

									    Args:
									        url: Address of the page to download.

									    Raises:
									        IndexError: If an item has no ``.HotList-itemTitle`` element.
									    """
									    async with aiohttp.ClientSession(headers=headers) as session:
									        async with session.get(url) as resp:
									            print(resp.status)  # print the HTTP status code
									            html = await resp.text()
									    soup = BeautifulSoup(html, 'lxml')
									    for item in soup.select('.HotList-item'):
									        title = item.select('.HotList-itemTitle')[0].text
									        # A missing excerpt is expected for some items; fall back to a
									        # placeholder instead of hiding every error behind a bare except.
									        excerpt = item.select_one('.HotList-itemExcerpt')
									        abstract = excerpt.text if excerpt is not None else 'No Abstract'
									        # select() returns a list of tags, so indexing its result with
									        # 'src' always raised and the image was never reported. Take the
									        # first matching tag and read its attribute instead.
									        img_tag = item.select_one('.HotList-itemImgContainer img')
									        img = img_tag.get('src', 'No Img') if img_tag is not None else 'No Img'
									        print("{}\n{}\n{}".format(title, abstract, img))

									if __name__ == '__main__':
									    url = 'https://www.zhihu.com/billboard'
									    # asyncio.run() creates the event loop, runs the coroutine to
									    # completion and closes the loop. It replaces the manual
									    # get_event_loop()/run_until_complete()/close() sequence, which is
									    # deprecated when no event loop is already running.
									    asyncio.run(getPages(url))

Python异步爬取知乎热榜实例分享

二、查看JS代码

发现详细链接、图片链接、问题摘要等都在JS里面（CSDN的开发者助手插件确实好用）

Python异步爬取知乎热榜实例分享

正则表达式获取上述信息:

Python异步爬取知乎热榜实例分享

接下来就是详细的代码啦

				?

									import asyncio

									import json

									import re

									import aiohttp

									# HTTP request headers handed to aiohttp.ClientSession inside getPages().
									headers = {
									    'user-agent': (
									        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
									        'AppleWebKit/537.36 (KHTML, like Gecko) '
									        'Chrome/86.0.4240.198 Safari/537.36'
									    ),
									    'referer': (
									        'https://www.baidu.com/s'
									        '?tn=02003390_43_hao_pg&isource=infinity&iname=baidu&itype=web&ie=utf-8'
									        '&wd=%E7%9F%A5%E4%B9%8E%E7%83%AD%E6%A6%9C'
									    ),
									}

									async def getPages(url):
									    """Download *url* and print every entry of the embedded hot list.

									    The page is requested with the module-level ``headers``. The entries
									    are taken from the JSON text found in the page source between
									    ``"hotList":`` and ``,"guestFeeds":``; for each entry the title,
									    popularity, excerpt, link and image URL are printed.

									    Args:
									        url: Address of the page to download.

									    Raises:
									        ValueError: If the hot-list JSON marker is not present in the
									            downloaded page.
									        KeyError: If an entry lacks one of the expected fields.
									    """
									    async with aiohttp.ClientSession(headers=headers) as session:
									        async with session.get(url) as resp:
									            print(resp.status)  # print the HTTP status code
									            html = await resp.text()
									    match = re.search(r'"hotList":(.*?),"guestFeeds":', html)
									    if match is None:
									        # Without this guard a page that lacks the marker failed with an
									        # opaque "'NoneType' object has no attribute 'group'" error.
									        raise ValueError(
									            'hotList data not found in the response from {}'.format(url)
									        )
									    for item in json.loads(match.group(1)):
									        target = item['target']
									        title = target['titleArea']['text']
									        # Empty excerpt / image values are replaced by placeholders.
									        question = target['excerptArea']['text'] or 'No Abstract'
									        hot = target['metricsArea']['text']
									        link = target['link']['url']
									        img = target['imageArea']['url'] or 'No Img'
									        print("Title：{}\nPopular：{}\nQuestion：{}\nLink：{}\nImg：{}".format(title,hot,question,link,img))

									if __name__ == '__main__':
									    url = 'https://www.zhihu.com/billboard'
									    # asyncio.run() creates the event loop, runs the coroutine to
									    # completion and closes the loop. It replaces the manual
									    # get_event_loop()/run_until_complete()/close() sequence, which is
									    # deprecated when no event loop is already running.
									    asyncio.run(getPages(url))