香港云主机最佳企业级服务商!

ADSL拨号VPS包含了中国大陆(联通,移动,电信)

中国香港,国外拨号VPS。

当前位置:云主机 > python >

电信ADSL拨号VPS
联通ADSL拨号VPS
移动ADSL拨号VPS

使用python实现抓取腾讯视频所有电影的爬虫


时间:2022-04-02 10:29 作者:admin


Python实现的抓取腾讯视频所有电影的爬虫

# -*- coding: utf-8 -*-
"""Crawl the Tencent Video (v.qq.com) movie listings into MongoDB.

The script targets Python 2 (it relies on ``urllib2`` and byte-string
``decode``).  It reads the category list page, walks every result page of
every category, and stores one document per movie in the
``dianying.playlinks`` collection of a MongoDB server on localhost:27017.
"""
import re
import urllib2
from bs4 import BeautifulSoup
import string
import time
import pymongo

NUM = 0          # global: running count of movies stored so far
m_type = u''     # global: category (genre) of the movies currently being crawled
m_site = u'qq'   # global: identifier of the site being crawled
conn = None      # global: shared MongoDB connection, created on first use

# Category link, e.g.
# <a _hot="tag.sub" class="_gtag _hotkey" href="http://v.qq.com/list/1_0_-1_-1_1_0_0_20_0_-1_0.html" title="<genre>" tvalue="0">...</a>
_TAG_RE = re.compile(
    r'<a _hot=\"tag\.sub\" class=\"_gtag _hotkey\" href=\"(.+?)\" title=\"(.+?)\" tvalue=\"(.+?)\">.+?</a>',
    re.DOTALL)

# Pager link, e.g.
# <a class="c_txt6" href="http://v.qq.com/list/1_2_-1_-1_1_0_24_20_0_-1_0.html" title="25"><span>25</span></a>
_PAGE_RE = re.compile(r'<a class=.+?><span>(.+?)</span></a>', re.DOTALL)

# One movie poster entry inside <ul class="mod_list_pic_130">.
_MOVIE_RE = re.compile(
    r'<li><a class=\"mod_poster_130\" href=\"(.+?)\" target=\"_blank\" title=\"(.+?)\"><img.+?</li>',
    re.DOTALL)

# Related-video link, e.g.
# <a href="http://www.tudou.com/albumplay/....html" target="new" title="..." wl="1"> </a>
_INFO_RE = re.compile(
    r'<a href=\"(.+?)\" target=\"new\" title=\"(.+?)\" wl=\".+?\"> </a>',
    re.DOTALL)


def _get_connection():
    """Return the shared MongoDB connection, opening it on first use.

    Reusing one connection avoids opening (and never closing) a new one for
    every list page, and gives ``insertdb`` a connection that actually exists.
    """
    global conn
    if conn is None:
        # ``MongoClient`` supersedes ``Connection`` (removed in pymongo 3);
        # fall back to ``Connection`` only on installs that predate it.
        factory = getattr(pymongo, 'MongoClient', None) or pymongo.Connection
        conn = factory('localhost', 27017)
    return conn


def gethtml(url):
    """Fetch *url* and return the raw response body.

    The response is closed even if reading fails.  Errors raised by
    ``urllib2`` propagate to the caller.
    """
    response = urllib2.urlopen(urllib2.Request(url))
    try:
        return response.read()
    finally:
        response.close()


def gettags(html):
    """Extract the movie categories from the category list page.

    Returns a dict mapping category title (unicode) to the category's list
    URL (unicode).  Returns an empty dict, after printing ``Not Find``, when
    the category ``<ul>`` or its links cannot be located.

    This function does not touch the global ``m_type``; the crawl loop in
    ``main`` sets it for the category being processed.
    """
    tags_url = {}
    soup = BeautifulSoup(html)
    # Category container: <ul class="clearfix _group" gname="mi_type" gtype="1">
    tags_all = soup.find_all(
        'ul', {'class': 'clearfix _group', 'gname': 'mi_type'})
    tags = _TAG_RE.findall(str(tags_all[0])) if tags_all else []
    if not tags:
        print("Not Find")
        return tags_url
    for href, title, _tvalue in tags:
        tags_url[title.decode('utf-8')] = href.decode('utf-8')
    return tags_url


def get_pages(tag_url):
    """Return the number of result pages for the category at *tag_url*.

    The value is the text of the second-to-last pager link (a string) when
    the pager holds more than one link, otherwise the integer ``1``.  A page
    without a pager element is treated as a single page.
    """
    soup = BeautifulSoup(gethtml(tag_url))
    # Pager container: <div class="mod_pagenav" id="pager">
    div_page = soup.find_all('div', {'class': 'mod_pagenav', 'id': 'pager'})
    if not div_page:
        return 1
    pages = _PAGE_RE.findall(str(div_page[0]))
    if len(pages) > 1:
        # NOTE(review): presumably the final link is the "next page" control,
        # which is why the second-to-last entry is used -- confirm.
        return pages[-2]
    return 1


def getmovielist(html):
    """Find every movie list block in *html* and store the movies it holds."""
    soup = BeautifulSoup(html)
    # Movie list container: <ul class="mod_list_pic_130">
    for block in soup.find_all('ul', {'class': 'mod_list_pic_130'}):
        # Newlines are stripped before the block is handed to the regex.
        getmovie(str(block).replace('\n', ''))


def getmovie(html):
    """Store each movie found in a list block and bump the global counter.

    Every match becomes one document in ``dianying.playlinks`` carrying the
    movie title, URL, site (``m_site``) and category (``m_type``).  ``NUM``
    is incremented exactly once per stored movie.
    """
    global NUM
    movies = _MOVIE_RE.findall(html)
    if not movies:
        return
    playlinks = _get_connection().dianying.playlinks
    for movie_url, movie_title in movies:
        NUM += 1
        print("%s : %d" % ("=" * 70, NUM))
        values = dict(
            movie_title=movie_title,
            movie_url=movie_url,
            movie_site=m_site,
            movie_type=m_type,
        )
        print(values)
        playlinks.insert(values)
        print("_" * 70)


def getmovieinfo(url):
    """Return ``(href, title)`` pairs for the links in a movie's album block.

    Returns an empty list, after printing ``Not find movie info``, when the
    album block is missing or contains no matching links.
    """
    soup = BeautifulSoup(gethtml(url))
    divs = soup.find_all('div', {'class': 'pack pack_album album_cover'})
    m_info = _INFO_RE.findall(str(divs[0])) if divs else []
    if not m_info:
        print("Not find movie info")
    return m_info


def insertdb(movieinfo):
    """Insert *movieinfo* into the ``dianying_at.movies`` collection."""
    _get_connection().dianying_at.movies.insert(movieinfo)


def main():
    """Crawl every category and every result page, storing all movies."""
    global m_type
    tags_url = "http://v.qq.com/list/1_-1_-1_-1_1_0_0_20_0_-1_0.html"
    tag_urls = gettags(gethtml(tags_url))
    for category, category_url in tag_urls.items():
        # Record the category so getmovie() labels each movie correctly.
        m_type = category
        page_url = category_url.encode('utf-8')
        print(page_url)
        maxpage = int(get_pages(page_url))
        print(maxpage)
        # List URLs look like .../1_0_-1_-1_1_0_<page>_20_0_-1_0.html; strip
        # the page-0 suffix once and re-append it with each page index.
        url_prefix = page_url.replace('0_20_0_-1_0.html', '')
        for x in range(0, maxpage):
            movie_url = "%s%d_20_0_-1_0.html" % (url_prefix, x)
            print(movie_url)
            getmovielist(gethtml(movie_url))
            time.sleep(0.1)  # brief pause between page requests


if __name__ == "__main__":
    main()

总结

以上所述是小编给大家介绍的使用Python实现抓取腾讯视频所有电影的爬虫,希望对大家有所帮助,如果大家有任何疑问欢迎给我留言,小编会及时回复大家的!

(责任编辑:admin)






帮助中心
会员注册
找回密码
新闻中心
快捷通道
域名登录面板
虚机登录面板
云主机登录面板
关于我们
关于我们
联系我们
联系方式

售前咨询:17830004266(重庆移动)

企业QQ:383546523

《中华人民共和国工业和信息化部》 编号:ICP备00012341号

Copyright © 2002 -2018 香港云主机 版权所有
声明:香港云主机品牌标志、品牌吉祥物均已注册商标,版权所有,窃用必究

云官方微信

在线客服

  • 企业QQ: 点击这里给我发消息
  • 技术支持:383546523

  • 公司总台电话:17830004266(重庆移动)
  • 售前咨询热线:17830004266(重庆移动)