使用Python3编写抓取网页和只抓网页图片的脚本_Python

使用Python3编写抓取网页和只抓网页图片的脚本

2020-07-30 11:32 damotiansheng Python

这篇文章主要介绍了使用Python3编写抓取网页和只抓网页图片的脚本，使用到了urllib模块，需要的朋友可以参考下。

最基本的抓取网页内容的代码实现：

				?

									#!/usr/bin/env python 

									try:
									    # Python 3: urlretrieve lives in urllib.request.
									    from urllib.request import urlretrieve
									except ImportError:
									    # Python 2 fallback (the original import).
									    from urllib import urlretrieve

									def firstNonBlank(lines):
									    """Return the first element of *lines* that is not blank.

									    An element counts as blank when stripping its whitespace leaves an
									    empty string. Returns None when *lines* is empty or every element
									    is blank.
									    """
									    return next((text for text in lines if text.strip()), None)

									def firstLast(webpage):
									    """Print the first and the last non-blank line of a text file.

									    Args:
									        webpage: Path of the local file to read.

									    A file with no non-blank line prints nothing (the original printed
									    the literal text "None" twice in that case).
									    """
									    # The with-block closes the file even if reading fails.
									    # errors="replace" keeps undecodable bytes from raising; UTF-8 is an
									    # assumption about the downloaded page -- TODO confirm for the target site.
									    with open(webpage, encoding="utf-8", errors="replace") as f:
									        lines = f.readlines()

									    first = firstNonBlank(lines)
									    # Scanning the reversed sequence yields the last non-blank line
									    # without mutating `lines`.
									    last = firstNonBlank(reversed(lines))

									    for line in (first, last):
									        if line is None:
									            continue
									        # Lines normally carry their own newline; add one only when it is
									        # missing (e.g. the final line of the file).
									        print(line, end="" if line.endswith("\n") else "\n")

									def download(url='http://www', process=firstLast):
									    """Retrieve *url* with urlretrieve and pass the local filename to *process*.

									    Args:
									        url: Address to fetch.
									        process: Callable invoked with the filename returned by
									            urlretrieve; defaults to firstLast.

									    An IOError raised during retrieval is swallowed and *process* is
									    not called.
									    """
									    try:
									        local_path, _headers = urlretrieve(url)
									    except IOError:
									        return
									    if local_path:
									        process(local_path)

									# Script entry point: run download() with its default url and process
									# arguments when the file is executed directly.
									if __name__ == '__main__': 

									  download()

利用urllib模块，来实现一个网页中针对图片的抓取功能：

				?

									import urllib.request 

									import socket 

									import re 

									import sys 

									import os 

									# Directory that downloaded images are written to.
									targetDir = r"C:\Users\elqstux\Desktop\pic"

									def destFile(path):
									    """Return the local file path under targetDir for the URL *path*.

									    The file name is everything after the last '/' in *path*; a *path*
									    without any '/' is used as the file name unchanged. targetDir is
									    created if it does not exist yet.

									    Args:
									        path: URL (or plain file name) of the resource being saved.

									    Returns:
									        targetDir joined with the extracted file name.
									    """
									    # makedirs(exist_ok=True) also creates missing parent directories and
									    # avoids the check-then-create race of isdir() followed by mkdir().
									    os.makedirs(targetDir, exist_ok=True)
									    # rpartition tolerates input with no '/', where rindex raised ValueError.
									    filename = path.rpartition('/')[2]
									    return os.path.join(targetDir, filename)

									if __name__ == "__main__":
									    # Page whose image links are collected and downloaded.
									    hostname = "http://www.douban.com"
									    req = urllib.request.Request(hostname)
									    # The with-block closes the HTTP response once the body has been read.
									    with urllib.request.urlopen(req) as webpage:
									        contentBytes = webpage.read()
									        # Charset announced in the Content-Type header, if any.
									        charset = webpage.headers.get_content_charset() or "utf-8"
									    # Decode rather than str(bytes): str() yields the "b'...'" repr, in which
									    # newlines are a literal backslash-n and so never stop the [^\s] match.
									    content = contentBytes.decode(charset, errors="replace")
									    # findall returns (full_url, extension) tuples; set() drops duplicates.
									    for link, _ext in set(re.findall(r'(http:[^\s]*?(jpg|png|gif))', content)):
									        print(link)
									        urllib.request.urlretrieve(link, destFile(link))

				?

									import urllib.request 

									import socket 

									import re 

									import sys 

									import os 

									# Directory that downloaded images would be written to.
									targetDir = r"H:\pic"

									def destFile(path):
									    """Map the URL *path* to a file path inside targetDir.

									    Only the part of *path* after its last '/' is kept as the file
									    name; when *path* contains no '/', the whole string is the file
									    name. targetDir is created on demand.

									    Args:
									        path: URL (or plain file name) of the resource being saved.

									    Returns:
									        targetDir joined with the extracted file name.
									    """
									    # exist_ok=True makes this a no-op when the directory is already there,
									    # and makedirs creates missing parents that os.mkdir would fail on.
									    os.makedirs(targetDir, exist_ok=True)
									    # The URL is split at its last '/'; rpartition does not raise when
									    # there is no '/' at all (rindex raised ValueError).
									    filename = path.rpartition('/')[2]
									    return os.path.join(targetDir, filename)

									if __name__ == "__main__":
									    # Page whose image links are listed.
									    hostname = "http://www.douban.com/"
									    req = urllib.request.Request(hostname)
									    # The with-block closes the HTTP response once the body has been read.
									    with urllib.request.urlopen(req) as webpage:
									        contentBytes = webpage.read()
									        # Charset announced in the Content-Type header, if any.
									        charset = webpage.headers.get_content_charset() or "utf-8"
									    # Decode rather than str(bytes): str() yields the "b'...'" repr, in which
									    # newlines are a literal backslash-n and so never stop the [^\s] match.
									    content = contentBytes.decode(charset, errors="replace")
									    # The pattern contains two nested groups, so findall returns a list of
									    # tuples; only the text matched by the groups appears in each tuple:
									    # (full URL, file extension).
									    match = re.findall(r'(http:[^\s]*?(jpg|png|gif))', content)
									    for picname, picType in match:
									        print(picname)
									        print(picType)

									''''' 

									输出： 

									http://img3.douban.com/pics/blank.gif 

									gif 

									http://img3.douban.com/icon/g111328-1.jpg 

									jpg 

									http://img3.douban.com/pics/blank.gif 

									gif 

									http://img3.douban.com/icon/g197523-19.jpg 

									jpg 

									http://img3.douban.com/pics/blank.gif 

									gif 

									... 

									'''