Python中如何使用requests获取知乎最有价值的内容

2023-07-21,

本篇文章为大家展示了Python中如何使用requests获取知乎最有价值的内容,内容简明扼要并且容易理解,绝对能使你眼前一亮,通过这篇文章的详细介绍希望你能有所收获。

一 前言 
  使用 requests 爬取知乎中最有价值的内容,写一段获取内容的 Python 程序。

二 践行

  1. #!/usr/bin/env python

  2. #-*- coding:utf-8 -*-

  3. import re

  4. import requests

  5. import os

  6. from urlparse import urlsplit

  7. from os.path import basename

  8. def getHtml(url):

  9.     session = requests.Session()

  10.     # 模拟浏览器访问

  11.     header = {

  12.         'User-Agent': "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",

  13.         'Accept-Encoding': 'gzip, deflate'}

  14.     res = session.get(url, headers=header)

  15.     if res.status_code == 200:

  16.         content = res.content

  17.     else:

  18.         content = ''

  19.     return content

  20. def mkdir(path):

  21.     if not os.path.exists(path):

  22.         print '新建文件夹:', path

  23.         os.makedirs(path)

  24.         return True

  25.     else:

  26.         print u"图片存放于:", os.getcwd() + os.sep + path

  27.         return False

  28. def download_pic(img_lists, dir_name):

  29.     print "一共有 {num} 张照片".format(num=len(img_lists))

  30.     for image_url in img_lists:

  31.         response = requests.get(image_url, stream=True)

  32.         if response.status_code == 200:

  33.             image = response.content

  34.         else:

  35.             continue

  36.         file_name = dir_name + os.sep + basename(urlsplit(image_url)[2])

  37.         try:

  38.             with open(file_name, "wb") as picture:

  39.                 picture.write(image)

  40.         except IOError:

  41.             print("IO Error\n")

  42.             return

  43.         finally:

  44.             picture.close

  45.             print "下载 {pic_name} 完成!".format(pic_name=file_name)

  46. def getAllImg(html):

  47.     # 利用正则表达式把源代码中的图片地址过滤出来

  48.     #reg = r'data-actualsrc="(.*?)">'

  49.     reg = r'https://pic\d.zhimg.com/[a-fA-F0-9]{5,32}_\w+.jpg'

  50.     imgre = re.compile(reg, re.S)

  51.     tmp_list = imgre.findall(html) # 表示在整个网页中过滤出所有图片的地址,放在imglist中

  52.     # 清理掉头像和去重 获取data-original的内容

  53.     tmp_list = list(set(tmp_list)) # 去重

  54.     imglist = []

  55.     for item in tmp_list:

  56.         if item.endswith('r.jpg'):

  57.             img_list.append(item)

  58.     print 'num : %d' % (len(imglist))

  59.     return imglist

  60. if __name__ == '__main__':

  61.     question_id = 35990613

  62.     zhihu_url = "https://www.zhihu.com/question/{qid}".format(qid=question_id)

  63.     html_content = getHtml(zhihu_url)

  64.     path = 'zhihu_pic'

  65.     mkdir(path) # 创建本地文件夹

  66.     img_list = getAllImg(html_content) # 获取图片的地址列表

  67.     download_pic(img_list, path)       # 保存图片

本代码还存在一些不足的地方:无法获取全部的图片,还需要兼容自动点击“更多”来加载更多答案的情况。
代码第二版解决了第一版代码中不能自动加载的问题。

  1. #!/usr/bin/env python

  2. #-*- coding:utf-8 -*-

  3. import re

  4. import requests

  5. import os

  6. from urlparse import urlsplit

  7. from os.path import basename

  8. headers = {

  9.     'User-Agent': "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",

  10.     'Accept-Encoding': 'gzip, deflate'}

  11. def mkdir(path):

  12.     if not os.path.exists(path):

  13.         print '新建文件夹:', path

  14.         os.makedirs(path)

  15.         return True

  16.     else:

  17.         print u"图片存放于:", os.getcwd() + os.sep + path

  18.         return False

  19. def download_pic(img_lists, dir_name):

  20.     print "一共有 {num} 张照片".format(num=len(img_lists))

  21.     for image_url in img_lists:

  22.         response = requests.get(image_url, stream=True)

  23.         if response.status_code == 200:

  24.             image = response.content

  25.         else:

  26.             continue

  27.         file_name = dir_name + os.sep + basename(urlsplit(image_url)[2])

  28.         try:

  29.             with open(file_name, "wb") as picture:

  30.                 picture.write(image)

  31.         except IOError:

  32.             print("IO Error\n")

  33.             continue

  34.         finally:

  35.             picture.close

  36.             print "下载 {pic_name} 完成!".format(pic_name=file_name)

  37. def get_image_url(qid, headers):

  38.     # 利用正则表达式把源代码中的图片地址过滤出来

  39.     #reg = r'data-actualsrc="(.*?)">'

  40.     tmp_url = "https://www.zhihu.com/node/QuestionAnswerListV2"

  41.     size = 10

  42.     image_urls = []

  43.     session = requests.Session()

  44.     # 利用循环自动完成需要点击 “更多” 获取所有答案,每个分页作为一个answer集合。

  45.     while True:

  46.         postdata = {'method': 'next', 'params': '{"url_token":' +

  47.                     str(qid) + ',"pagesize": "10",' + '"offset":' + str(size) + "}"}

  48.         page = session.post(tmp_url, headers=headers, data=postdata)

  49.         ret = eval(page.text)

  50.         answers = ret['msg']

  51.         size += 10

  52.         if not answers:

  53.             print "图片URL获取完毕, 页数: ", (size - 10) / 10

  54.             return image_urls

  55.         #reg = r'https://pic\d.zhimg.com/[a-fA-F0-9]{5,32}_\w+.jpg'

  56.         imgreg = re.compile('data-original="(.*?)"', re.S)

  57.         for answer in answers:

  58.             tmp_list = []

  59.             url_items = re.findall(imgreg, answer)

  60.             for item in url_items: # 这里去掉得到的图片URL中的转义字符'\\'

  61.                 image_url = item.replace("\\", "")

  62.                 tmp_list.append(image_url)

  63.             # 清理掉头像和去重 获取data-original的内容

  64.             tmp_list = list(set(tmp_list)) # 去重

  65.             for item in tmp_list:

  66.                 if item.endswith('r.jpg'):

  67.                     print item

  68.                     image_urls.append(item)

  69.         print 'size: %d, num : %d' % (size, len(image_urls))

  70. if __name__ == '__main__':

  71.     question_id = 26037846

  72.     zhihu_url = "https://www.zhihu.com/question/{qid}".format(qid=question_id)

  73.     path = 'zhihu_pic'

  74.     mkdir(path) # 创建本地文件夹

  75.     img_list = get_image_url(question_id, headers) # 获取图片的地址列表

  76.     download_pic(img_list, path) # 保存图片

上述内容就是Python中如何使用requests获取知乎最有价值的内容,你们学到知识或技能了吗?如果还想学到更多技能或者丰富自己的知识储备,欢迎关注本站行业资讯频道。