香港云主机最佳企业级服务商!

ADSL拨号VPS包含了中国大陆(联通,移动,电信),

中国香港,国外拨号VPS。

当前位置:云主机 > python >

电信ADSL拨号VPS
联通ADSL拨号VPS
移动ADSL拨号VPS

Python实现的文轩网爬虫完整示例


时间:2022-04-02 10:36 作者:admin


本文实例讲述了Python实现的文轩网爬虫。分享给大家供大家参考,具体如下:

# encoding=utf8
#
# Crawler for winxuan.com book detail pages.
#
# For each of the first 58 category links on the home page it collects up to
# 248 links from the category page, scrapes every detail page and inserts one
# row per book into the MySQL table `bi_book`, downloading the cover images
# into ./download/<isbn>/ next to the script.
#
# NOTE: this script targets Python 2 (it relies on reload(sys) /
# sys.setdefaultencoding, cgi.escape and implicit str<->unicode mixing).
import pymysql
import time
import sys
import requests
import os
# error capturing
import traceback
import types
# HTML entity escaping
import cgi
import warnings

# Python 2 only: make implicit str<->unicode conversions use UTF-8 so the
# byte-string labels below can be compared with the unicode page text.
reload(sys)
sys.setdefaultencoding('utf-8')

from pyquery import PyQuery as pq
from lxml import etree

# suppress warnings
warnings.filterwarnings("ignore")

# Placeholder used when a date is missing or given as '无'.
_NO_DATE = '1970-01-01'

# (label searched in the row text, key in the parsed-info dict).
# The label is removed from the row text and the remainder is stored as-is.
_SIMPLE_FIELDS = (
    ('作者:', 'author'),
    ('出版社:', 'press_name'),
    ('版次:', 'edition'),
    ('印次:', 'impressions'),
    ('装帧:', 'packaging'),
    ('开本:', 'size'),
    ('页数:', 'page_num'),
    ('字数:', 'word_num'),
)

# Date rows: stripped, and '无' is mapped to _NO_DATE.
_DATE_FIELDS = (
    ('出版时间:', 'press_time'),
    ('印刷时间:', 'print_time'),
)

# (marker that identifies an ISBN row, prefix removed to get the value).
# NOTE(review): every ' ' is stripped from the row text before matching, so
# the first marker cannot match if these are plain ASCII spaces; the original
# page may have used a different space character -- confirm.
_ISBN_MARKERS = (
    ('I S B N', 'I S B N:'),
    ('isbn:', 'isbn:'),
)


def dowloadPic(imageUrl, filePath):
    """Download ``imageUrl`` and save the response body to ``filePath``.

    Returns 404 when the server answers with HTTP 404 (nothing is written);
    otherwise writes the file and returns None.
    """
    r = requests.get(imageUrl, timeout=60)
    if r.status_code == 404:
        return 404
    with open(filePath, "wb") as code:
        code.write(r.content)


def _book_exists(cursor, isbn):
    """Return True if ``bi_book`` already has a row with this ISBN."""
    # Parameterized: the ISBN comes from scraped page text and must not be
    # concatenated into the SQL statement.
    cursor.execute('select count(*) from bi_book where isbn = %s', (isbn,))
    result = cursor.fetchone()
    return result['count(*)'] != 0


def _scrape_and_store(final_url, conn, cursor):
    """Scrape one detail page and insert it; see getData for the contract."""
    # Parse the detail page; pages carrying one of these xmlns values on
    # <html> are skipped, as is any page that fails to load.
    try:
        detail_url = final_url
        c = pq(detail_url)
        head = c('html').attr('xmlns')
        if head in ('http://www.w3.org/1999/xhtml',
                    'http://www.winxuan.com/cms/2016db_sh'):
            return 'back'
    except Exception:
        return 'back'

    # Defaults for every attribute that may be absent from the page.
    info = {
        'author': '',
        'press_name': '',
        'edition': '',
        'impressions': '',
        'packaging': '',
        'size': '',
        'page_num': '',
        'word_num': '',
        'press_time': _NO_DATE,
        'print_time': _NO_DATE,
    }
    isbn = None

    # The first 12 <li> rows of the attribute list hold "label:value" pairs.
    rows = c('#page').find('.cont').find('li')
    for i in range(12):
        text = rows.eq(i).text()
        text = text.replace(' ', '')

        for marker, prefix in _ISBN_MARKERS:
            if marker in text:
                isbn = text.replace(prefix, '').strip()
                if isbn and _book_exists(cursor, isbn):
                    print(u'已存在')
                    return 'back'

        for label, key in _SIMPLE_FIELDS:
            if label in text:
                info[key] = text.replace(label, '')

        for label, key in _DATE_FIELDS:
            if label in text:
                value = text.replace(label, '').strip()
                if value == '无':
                    value = _NO_DATE
                info[key] = value

    # Without an ISBN there is no key for the row or the image directory.
    if not isbn:
        return 'back'

    # URL of the site's "no picture" placeholder image.
    none_img = 'http://static.winxuancdn.com/goods/sml_blank.jpg'
    # Large / small cover image URLs.
    big_path = c('.info-side').find('.img').find('a').find('img').attr('src')
    if big_path is None:
        return 'back'
    elif big_path == none_img:
        big_path = ''
        small_path = ''
    else:
        small_path = big_path.replace('_16', '_11')

    # Category: text of the last <a> inside the first breadcrumb block.
    ahtml = c('#page').find('.base-nav').eq(0).html()
    cate = pq(ahtml)
    category = cate('a:last').text()

    # Book title.
    name = c('.info-main').find('.name').eq(0).find('h1').eq(0).text()
    name = name.strip()

    # Price without the currency sign.
    price = c('.info-main').find('.attr').eq(0).find('.price-n').eq(0).find('b').text()
    price = price.replace('¥', '')

    # Scan title blocks 5..11 for the synopsis and the table of contents;
    # both stay empty strings when the page has no such section.
    content = ''
    directory = ''
    titles = c('#page').find('.title')
    for k in range(5, 12):
        title = titles.eq(k).find('.tab').find('h4').text()
        if '内容简介' in title:
            con = titles.eq(k).nextAll()
            det = pq(con)
            content = det('.text-words-1').html()
            content = content.encode("utf8", "ignore")
        if '目录' in title:
            con = titles.eq(k).nextAll()
            dry = pq(con)
            directory = dry('.text-words-1').html()
            directory = directory.encode("utf8", "ignore")

    details = '内容简介<br>' + content + '<br><br>目录<br>' + directory
    details = cgi.escape(details)

    # Date the record is added.
    add_time = time.strftime('%Y-%m-%d', time.localtime(time.time()))

    # Images are stored under <script dir>/download/<isbn>/.
    root_path = sys.path[0]
    root_path = root_path.replace('\\', '/')
    isbn_path = root_path + '/download/' + isbn

    if big_path != '' and small_path != '':
        if not os.path.isdir(isbn_path):
            # makedirs also creates a missing 'download' parent directory.
            os.makedirs(isbn_path)
            # Local paths the downloaded images are saved to.
            down_img_small = isbn_path + "/small" + isbn + ".jpg"
            down_img_big = isbn_path + '/big' + isbn + ".jpg"
            small_res = dowloadPic(small_path, down_img_small)
            big_res = dowloadPic(big_path, down_img_big)
            # Paths stored in the database; fall back to the placeholder
            # picture when the image URL answered 404.
            if small_res == 404:
                img_small = 'none-picture/none-small.jpg'
            else:
                img_small = 'download/' + isbn + '/small' + isbn + '.jpg'
            if big_res == 404:
                img_big = 'none-picture/none-big.jpg'
            else:
                img_big = 'download/' + isbn + '/big' + isbn + '.jpg'
        else:
            # Directory already exists: no download is attempted, only the
            # database paths are composed.
            img_small = 'download/' + isbn + '/small' + isbn + '.jpg'
            img_big = 'download/' + isbn + '/big' + isbn + '.jpg'
    else:
        img_big = 'none-picture/none-big.jpg'
        img_small = 'none-picture/none-small.jpg'

    source_type = 3
    try:
        # Values in the same order as the column list below.
        li = [0, source_type, category, details, detail_url, price, add_time,
              info['packaging'], info['print_time'], info['impressions'],
              name, info['author'], info['press_name'], isbn,
              info['edition'], info['size'], info['press_time'],
              info['page_num'], info['word_num'], img_big, img_small]
        sql = "insert into bi_book (book_id,source_type,category,details,detail_url,price,add_time,packaging,print_time,impressions,name,author,press_name,isbn,edition,size,press_time,page_num,word_num,img_big,img_small) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
        aaa = cursor.execute(sql, li)
        if aaa == 1:
            print(u'插入成功')
        conn.commit()
    except Exception:
        # Best effort: report the failure and skip this book.
        traceback.print_exc()
        return 'back'


def getData(final_url):
    """Scrape the detail page at ``final_url`` and insert it into ``bi_book``.

    The URL is first written to ./url.txt (overwriting the previous one).
    Returns 'back' when the page is skipped (load failure, excluded page
    type, book already stored, missing ISBN or image, failed insert) and
    None after a completed insert attempt. Other scraping errors propagate
    to the caller.
    """
    with open('./url.txt', 'w') as file_open:
        file_open.write(final_url)

    # Connect to the database.
    conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                           passwd='root', db='bookinfo', charset='utf8')
    try:
        # Cursor returning rows as dicts keyed by column name.
        cursor = conn.cursor(cursor=pymysql.cursors.DictCursor)
        try:
            return _scrape_and_store(final_url, conn, cursor)
        finally:
            cursor.close()
    finally:
        # Always release the connection, whichever path returned or raised.
        conn.close()


def winxuan(n):
    """Crawl the ``n``-th category link of the winxuan.com main menu.

    Returns 'backs' if the category page cannot be loaded, otherwise None.
    """
    # Parse the home page.
    home_url = 'http://www.winxuan.com/'
    h = pq(home_url)
    # Link of the n-th category in the navigation menu.
    menu = h('.mod-mainmenu').find('dd').find('a').eq(n).attr('href')
    # Category landing page.
    try:
        mh = pq(menu)
    except Exception:
        return 'backs'

    # Collect the href of the first 248 anchors, de-duplicated. Anchors
    # without an href are dropped so getData is never called with None.
    links = mh('.main').find('a')
    urls = set()
    for u in range(248):
        href = links.eq(u).attr('href')
        if href:
            urls.add(href)

    for final_url in urls:
        try:
            result = getData(final_url)
        except Exception:
            continue
        if result == 'back':
            continue
        # NOTE(review): the collapsed source lost its indentation here; this
        # print is assumed to belong to the per-URL loop -- confirm.
        print('OK,finished')


# Walk the first 58 categories, recording the current index in ./number.txt.
n = 0
while n < 58:
    print(n)
    string = str(n)
    with open('./number.txt', 'w') as file_open:
        file_open.write(string)
    res = winxuan(n)
    n += 1
    if res == 'backs':
        continue

更多关于Python相关内容可查看本站专题:《Python Socket编程技巧总结》、《Python正则表达式用法总结》、《Python数据结构与算法教程》、《Python函数使用技巧总结》、《Python字符串操作技巧汇总》、《Python入门与进阶经典教程》及《Python文件与目录操作技巧汇总》

希望本文所述对大家Python程序设计有所帮助。

(责任编辑:admin)






帮助中心
会员注册
找回密码
新闻中心
快捷通道
域名登录面板
虚机登录面板
云主机登录面板
关于我们
关于我们
联系我们
联系方式

售前咨询:17830004266(重庆移动)

企业QQ:383546523

《中华人民共和国工业和信息化部》 编号:ICP备00012341号

Copyright © 2002 -2018 香港云主机 版权所有
声明:香港云主机品牌标志、品牌吉祥物均已注册商标,版权所有,窃用必究

云官方微信

在线客服

  • 企业QQ: 点击这里给我发消息
  • 技术支持:383546523

  • 公司总台电话:17830004266(重庆移动)
  • 售前咨询热线:17830004266(重庆移动)