Python实现的文轩网爬虫完整示例
时间:2022-04-02 10:36 作者:admin
本文实例讲述了Python实现的文轩网爬虫。分享给大家供大家参考,具体如下:
encoding=utf8import pymysql/' target='_blank'>mysqlimport timeimport sysimport requestsimport os#捕获错误import tracebackimport types#将html实体化import cgiimport warningsreload(sys)sys.setdefaultencoding('utf-8')from pyquery import PyQuery as pqfrom lxml import etreesys.setdefaultencoding('utf-8')#屏蔽错误warnings.filterwarnings("ignore")#下载图片def dowloadPic(imageUrl,filePath):r = requests.get(imageUrl,timeout=60)status=r.status_codeif status == 404:return 404with open(filePath, "wb") as code:code.write(r.content)#根据详情页地址抓取数据并插入数据库def getData(final_url):file_open=open('./url.txt', 'w')file_open.write(final_url)file_open.close()#链接数据库conn = pyMySQL.connect(host='127.0.0.1', port=3306, user='root', passwd='root', db='bookinfo', charset='utf8')#设置浮标cursor = conn.cursor(cursor=pymysql.cursors.DictCursor)#解析详情页面try:detail_url=final_urlc=pq(detail_url)head=c('html').attr('xmlns')err='http://www.w3.org/1999/xhtml'err1='http://www.winxuan.com/cms/2016db_sh'if head == err or head == err1:return 'back'except Exception, e:return 'back'i=0while i<12: text = c('#page').find('.cont').find('li').eq(i).text() text=text.replace(' ','') if 'I S B N' in text: isbn=text.replace('I S B N:','') isbn=isbn.strip() sel='select count(*) from bi_book where isbn ='+isbn cursor.execute(sel) result=cursor.fetchone() count=result['count(*)'] if count != 0 : print u'已存在' return 'back' if 'isbn:' in text : isbn=text.replace('isbn:','') isbn=isbn.strip() sel='select count(*) from bi_book where isbn ='+isbn cursor.execute(sel) result=cursor.fetchone() count=result['count(*)'] if count != 0 : print u'已存在' return 'back' if '作者:' in text : author = text.replace('作者:','') if '出版社:' in text : press_name=text.replace('出版社:','') if '版次:' in text : edition=text.replace('版次:','') if '印次:' in text : impressions=text.replace('印次:','') if '装帧:' in text : packaging=text.replace('装帧:','') if '开本:' in text: size=text.replace('开本:','') if '出版时间:' in text: press_time=text.replace('出版时间:','') press_time=press_time.strip() if 
press_time == '无': press_time='1970-01-01' if '印刷时间:' in text: print_time=text.replace('印刷时间:','') print_time=print_time.strip() if print_time== '无': print_time='1970-01-01' if '页数:' in text: page_num=text.replace('页数:','') if '字数:' in text: word_num=text.replace('字数:','') i+=1if ('author' in locals().keys()) == False: author = ''if ('press_time' in locals().keys()) == False: press_time = '1970-01-01'if ('print_time' in locals().keys()) == False: print_time = '1970-01-01'if ('impressions' in locals().keys()) == False: impressions = ''if ('edition' in locals().keys())== False: edition = ''if ('page_num' in locals().keys())== False: page_num = ''if ('word_num' in locals().keys())== False: word_num = ''if ('packaging' in locals().keys())== False: packaging = ''if ('size' in locals().keys())== False: size = ''if ('press_name' in locals().keys())== False: press_name = ''#暂无图片地址none_img='http://static.winxuancdn.com/goods/sml_blank.jpg'#获取大小图地址big_path=c('.info-side').find('.img').find('a').find('img').attr('src')if big_path is None: return 'back'elif big_path == none_img : big_path='' small_path=''else : small_path=big_path.replace('_16','_11')#获取分类#先获取a标签htmlahtml=c('#page').find('.base-nav').eq(0).html()#解析a标签htmlcate=pq(ahtml)#获取分类的最后一个分类category=cate('a:last').text()#获取书名name=c('.info-main').find('.name').eq(0).find('h1').eq(0).text()name=name.strip()#获取价格price=c('.info-main').find('.attr').eq(0).find('.price-n').eq(0).find('b').text()price=price.replace('¥','')#循环获取内容简介和目录信息k=5while k<12: title=c('#page').find('.title').eq(k).find('.tab').find('h4').text() if '内容简介' in title: con=c('#page').find('.title').eq(k).nextAll() det=pq(con) content=det('.text-words-1').html() content=content.encode("utf8", "ignore"); if '目录' in title: con=c('#page').find('.title').eq(k).nextAll() dry=pq(con) directory=dry('.text-words-1').html() directory=directory.encode("utf8", "ignore"); k+=1#如果内容简介和目录没有的时候指定为空字符串if ('content' in locals().keys())== False: content = ''if ('directory' in 
locals().keys())== False: directory = ''details = '内容简介<br>'+content+'<br><br>目录<br>'+directorydetails=cgi.escape(details)#录入时间add_time = time.strftime('%Y-%m-%d',time.localtime(time.time()))#下载小图#文件根目录root_path=sys.path[0]#创建isbn文件夹路径root_path=root_path.replace('\\','/')isbn_path=root_path+'/download/'+isbnif big_path != '' and small_path !='' : #创建isbn目录 if os.path.isdir(isbn_path) ==False : os.mkdir(isbn_path) #组合下载后图片保存路径 down_img_small = isbn_path+"/small"+isbn+".jpg" down_img_big = isbn_path+'/big'+isbn+".jpg" #调用下载图片方法 small_res=dowloadPic(small_path,down_img_small) #大图保存数据库路径 big_res=dowloadPic(big_path,down_img_big) #小图保存数据库路径 if small_res==404 : img_small = 'none-picture/none-small.jpg' else : img_small = 'download/'+isbn+'/small'+isbn+'.jpg' if big_res==404 : img_big = 'none-picture/none-big.jpg' else : img_big = 'download/'+isbn+'/big'+isbn+'.jpg' else : #组合保存数据库中的图片路径 img_small = 'download/'+isbn+'/small'+isbn+'.jpg' img_big = 'download/'+isbn+'/big'+isbn+'.jpg'else : img_big = 'none-picture/none-big.jpg' img_small = 'none-picture/none-small.jpg'source_type = 3try : #要插入的列表 li=[0,source_type,category,details,detail_url,price,add_time,packaging,print_time,impressions,name,author,press_name,isbn,edition,size,press_time,page_num,word_num,img_big,img_small] #执行sql sql="insert into bi_book (book_id,source_type,category,details,detail_url,price,add_time,packaging,print_time,impressions,name,author,press_name,isbn,edition,size,press_time,page_num,word_num,img_big,img_small) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)" aaa=cursor.execute(sql,li) if aaa==1: print u'插入成功' conn.commit()except Exception, e : return 'back'def winxuan(n):#首页解析home_url='http://www.winxuan.com/'h=pq(home_url)#分类导航链接menu=h('.mod-mainmenu').find('dd').find('a').eq(n).attr('href')#print menu#分类书籍首页try:mh=pq(menu)except Exception, e :return 'backs'# text=mh('.main').find('a').text()# text=text.encode("GBK", "ignore");li=[]u=0while u<248 
:detail_urls=mh('.main').find('a').eq(u).attr('href')#将取到所有地址放入到列表当中li.append(detail_urls)u+=1#进行列表去重li=list(set(li))for final_url in li:try:result=getData(final_url)except Exception, e :continueif result=='back' :continueprint 'OK,finished'n=0while n<58:while n<58:print nstring=str(n)file_open=open('./number.txt', 'w')file_open.write(string)file_open.close()res=winxuan(n)n+=1if res=='backs' :continue
更多关于Python相关内容可查看本站专题:《Python Socket编程技巧总结》、《Python正则表达式用法总结》、《Python数据结构与算法教程》、《Python函数使用技巧总结》、《Python字符串操作技巧汇总》、《Python入门与进阶经典教程》及《Python文件与目录操作技巧汇总》
希望本文所述对大家Python程序设计有所帮助。
(责任编辑:admin)