Python实现的文轩网爬虫完整示例_Python

本文实例讲述了 Python 实现的文轩网爬虫，分享给大家供大家参考，具体如下：
				?

									# encoding=utf8

									import pymysql

									import time

									import sys

									import requests

									import os

									#捕获错误

									import traceback

									import types

									#将html实体化

									import cgi

									import warnings

									# Python 2 idiom: reload(sys) makes sys.setdefaultencoding callable again,
									# so implicit str/unicode conversions in this script use UTF-8.
									reload(sys)
									sys.setdefaultencoding('utf-8')

									# The callable document class is ``PyQuery``; importing the lower-case
									# ``pyquery`` name binds the submodule, which cannot be called as pq(url).
									from pyquery import PyQuery as pq
									from lxml import etree

									# Silence every warning category for the whole run.
									warnings.filterwarnings("ignore")

									#下载图片

									def dowloadpic(imageurl, filepath):
									    """Fetch ``imageurl`` and save the response body to ``filepath``.

									    The request uses a 60 second timeout.  When the server answers with
									    status 404 nothing is written and the integer 404 is returned; for any
									    other status the body is written in binary mode and None is returned.
									    """
									    response = requests.get(imageurl, timeout=60)
									    if response.status_code != 404:
									        with open(filepath, "wb") as image_file:
									            image_file.write(response.content)
									        return None
									    return 404

									#根据详情页地址抓取数据并插入数据库

									# Labels that introduce the ISBN row: both spellings the original code
									# looked for, in either letter case.
									# NOTE(review): the upper-case variants were added on the assumption that
									# the site prints "ISBN" in capitals -- confirm against a live page.
									_ISBN_LABELS = ('I S B N：', 'i s b n：', 'ISBN：', 'isbn：')

									# (label, field) pairs for plain-text attributes.  Kept as ordered pairs so
									# each label is tested with ``in``, like the original chain of ``if``s.
									_TEXT_LABELS = (
									    ('作者：', 'author'),
									    ('出版社：', 'press_name'),
									    ('版次：', 'edition'),
									    ('印次：', 'impressions'),
									    ('装帧：', 'packaging'),
									    ('开本：', 'size'),
									    ('页数：', 'page_num'),
									    ('字数：', 'word_num'),
									)

									# (label, field) pairs for date attributes; the value '无' and a missing
									# row are both stored as the placeholder date below.
									_DATE_LABELS = (
									    ('出版时间：', 'press_time'),
									    ('印刷时间：', 'print_time'),
									)
									_UNKNOWN_DATE = '1970-01-01'

									# Cover URL the page uses when a book has no picture.
									_NO_COVER_URL = 'http://static.winxuancdn.com/goods/sml_blank.jpg'

									# xmlns values of the <html> element for which the page is skipped.
									_SKIPPED_XMLNS = (
									    'http://www.w3.org/1999/xhtml',
									    'http://www.winxuan.com/cms/2016db_sh',
									)


									def _extract_attributes(page):
									    """Read the attribute list (ISBN, author, publisher, ...) of a page.

									    Only the first 12 ``li`` elements under ``#page .cont`` are inspected.
									    Returns a dict holding every attribute column of the insert; fields
									    absent from the page keep their default ('' or '1970-01-01') and
									    ``'isbn'`` stays None when no ISBN row is found.
									    """
									    fields = {'isbn': None}
									    for _label, key in _TEXT_LABELS:
									        fields[key] = ''
									    for _label, key in _DATE_LABELS:
									        fields[key] = _UNKNOWN_DATE

									    rows = page('#page').find('.cont').find('li')
									    for index in range(12):
									        text = rows.eq(index).text()
									        # Drop full-width spaces used as padding inside the labels.
									        text = text.replace('　', '')

									        for label in _ISBN_LABELS:
									            if label in text:
									                fields['isbn'] = text.replace(label, '').strip()
									                break
									        for label, key in _TEXT_LABELS:
									            if label in text:
									                fields[key] = text.replace(label, '')
									        for label, key in _DATE_LABELS:
									            if label in text:
									                value = text.replace(label, '').strip()
									                fields[key] = _UNKNOWN_DATE if value == '无' else value
									    return fields


									def _extract_details(page):
									    """Build the HTML-escaped "summary + table of contents" text.

									    Looks at the ``.title`` elements with index 5..11 under ``#page``; a
									    section that is missing (or has no ``.text-words-1`` body) contributes
									    an empty string.
									    """
									    sections = {'content': '', 'directory': ''}
									    titles = page('#page').find('.title')
									    for index in range(5, 12):
									        title_node = titles.eq(index)
									        heading = title_node.find('.tab').find('h4').text()
									        for marker, key in (('内容简介', 'content'), ('目录', 'directory')):
									            if marker in heading:
									                # The section body lives in the siblings after the title.
									                body = pq(title_node.nextAll())('.text-words-1').html()
									                sections[key] = (body or u'').encode("utf8", "ignore")

									    details = ('内容简介<br>' + sections['content'] +
									               '<br><br>目录<br>' + sections['directory'])
									    return cgi.escape(details)


									def _store_images(isbn, big_path, small_path):
									    """Download the cover images if needed and return their stored paths.

									    Returns an ``(img_big, img_small)`` tuple of relative paths for the
									    database.  Placeholder paths are returned when the book has no cover
									    or when a download answers 404.  An already existing per-ISBN
									    directory is treated as "downloaded before" and nothing is fetched.
									    """
									    placeholder_big = 'none-picture/none-big.jpg'
									    placeholder_small = 'none-picture/none-small.jpg'
									    if big_path == '' or small_path == '':
									        return placeholder_big, placeholder_small

									    stored_big = 'download/' + isbn + '/big' + isbn + '.jpg'
									    stored_small = 'download/' + isbn + '/small' + isbn + '.jpg'

									    # Images are stored below the script directory, with forward slashes.
									    root_path = sys.path[0].replace('\\', '/')
									    isbn_path = root_path + '/download/' + isbn
									    if os.path.isdir(isbn_path):
									        return stored_big, stored_small

									    # makedirs also creates a missing "download" parent directory.
									    os.makedirs(isbn_path)
									    small_res = dowloadpic(small_path, isbn_path + "/small" + isbn + ".jpg")
									    big_res = dowloadpic(big_path, isbn_path + '/big' + isbn + ".jpg")
									    img_small = placeholder_small if small_res == 404 else stored_small
									    img_big = placeholder_big if big_res == 404 else stored_big
									    return img_big, img_small


									def getdata(final_url):
									    """Scrape one book detail page and insert it into ``bi_book``.

									    The URL is first written to ``./url.txt`` (overwriting the file), then
									    a MySQL connection is opened, the page is parsed, cover images are
									    downloaded and one row is inserted and committed.  The connection is
									    always closed before returning.

									    Args:
									        final_url: address of the detail page.

									    Returns:
									        ``'back'`` when the page is skipped: it cannot be loaded, its
									        ``xmlns`` is one of the skipped values, it has no ISBN or no cover
									        element, the ISBN is already stored, or the insert fails.
									        ``None`` after a successful insert.

									    Raises:
									        Any error raised while parsing the remaining page parts or while
									        downloading images propagates to the caller.
									    """
									    with open('./url.txt', 'w') as progress_file:
									        progress_file.write(final_url)

									    conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='root', db='bookinfo', charset='utf8')
									    try:
									        # Rows come back as dicts keyed by column name.
									        cursor = conn.cursor(cursor=pymysql.cursors.DictCursor)

									        detail_url = final_url
									        try:
									            c = pq(detail_url)
									            head = c('html').attr('xmlns')
									        except Exception:
									            return 'back'
									        if head in _SKIPPED_XMLNS:
									            return 'back'

									        fields = _extract_attributes(c)
									        isbn = fields['isbn']
									        if not isbn:
									            # Without an ISBN there is no key for the duplicate check or
									            # for the image directory.
									            return 'back'

									        # Parameterised query: the ISBN is scraped text and must not be
									        # spliced into the SQL string.
									        cursor.execute('select count(*) as cnt from bi_book where isbn = %s', (isbn,))
									        if cursor.fetchone()['cnt'] != 0:
									            print(u'已存在')
									            return 'back'

									        big_path = c('.info-side').find('.img').find('a').find('img').attr('src')
									        if big_path is None:
									            return 'back'
									        if big_path == _NO_COVER_URL:
									            big_path = ''
									            small_path = ''
									        else:
									            # The small image URL differs only by its size suffix.
									            small_path = big_path.replace('_16', '_11')

									        # Category: text of the last link in the first breadcrumb bar.
									        ahtml = c('#page').find('.base-nav').eq(0).html()
									        category = pq(ahtml)('a:last').text()

									        name = c('.info-main').find('.name').eq(0).find('h1').eq(0).text()
									        name = name.strip()

									        price = c('.info-main').find('.attr').eq(0).find('.price-n').eq(0).find('b').text()
									        price = price.replace('¥', '')

									        details = _extract_details(c)

									        # Four-digit year, matching the '1970-01-01' placeholder dates.
									        add_time = time.strftime('%Y-%m-%d')

									        img_big, img_small = _store_images(isbn, big_path, small_path)

									        source_type = 3
									        row = [0, source_type, category, details, detail_url, price, add_time,
									               fields['packaging'], fields['print_time'], fields['impressions'],
									               name, fields['author'], fields['press_name'], isbn,
									               fields['edition'], fields['size'], fields['press_time'],
									               fields['page_num'], fields['word_num'], img_big, img_small]
									        sql = "insert into bi_book (book_id,source_type,category,details,detail_url,price,add_time,packaging,print_time,impressions,name,author,press_name,isbn,edition,size,press_time,page_num,word_num,img_big,img_small) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
									        try:
									            inserted = cursor.execute(sql, row)
									            if inserted == 1:
									                print(u'插入成功')
									            conn.commit()
									        except Exception:
									            # Still reported to the caller as 'back', but no longer silent.
									            traceback.print_exc()
									            return 'back'
									    finally:
									        conn.close()

									def winxuan(n):
									    """Crawl the books linked from one entry of the winxuan.com main menu.

									    Loads the home page, follows the ``n``-th link inside
									    ``.mod-mainmenu dd``, collects the ``href`` of up to the first 248
									    anchors under ``.main`` (duplicates and anchors without ``href`` are
									    dropped, page order is kept) and calls ``getdata`` on each of them.

									    Args:
									        n: zero-based index of the menu link to follow.

									    Returns:
									        ``'backs'`` when the home page or the category page cannot be
									        loaded or the menu entry has no link; ``None`` after all collected
									        URLs have been processed.
									    """
									    home_url = 'http://www.winxuan.com/'
									    try:
									        home = pq(home_url)
									        menu = home('.mod-mainmenu').find('dd').find('a').eq(n).attr('href')
									        if not menu:
									            return 'backs'
									        category_page = pq(menu)
									    except Exception:
									        return 'backs'

									    # Evaluate the selector once instead of once per index.
									    anchors = category_page('.main').find('a')
									    seen = set()
									    urls = []
									    for index in range(248):
									        href = anchors.eq(index).attr('href')
									        # eq() past the last anchor yields no href; skip those.
									        if href and href not in seen:
									            seen.add(href)
									            urls.append(href)

									    for final_url in urls:
									        try:
									            getdata(final_url)
									        except Exception:
									            # Best effort: one broken detail page must not stop the crawl.
									            continue
									    print('ok,finished')

									# Driver: walk menu entries 0..57.  The index being processed is written
									# to ./number.txt (overwritten each round) before the category is crawled.
									n = 0
									while n < 58:
									    print(n)
									    with open('./number.txt', 'w') as progress_file:
									        progress_file.write(str(n))
									    # A 'backs' result just means this category was skipped; either way
									    # the loop moves on to the next index.
									    res = winxuan(n)
									    n += 1