香港云主机最佳企业级服务商!

ADSL拨号VPS包含了中国大陆(联通、移动、电信)

中国香港,国外拨号VPS。

当前位置:云主机 > python >

电信ADSL拨号VPS
联通ADSL拨号VPS
移动ADSL拨号VPS

Python爬取国外天气预报网站的方法


时间:2021-12-08 14:53 作者:admin610456


本文实例讲述了Python爬取国外天气预报网站的方法。分享给大家供大家参考。具体如下:

crawl_weather.py如下:

# encoding=utf-8
"""Crawl accuweather.com and collect every "weather-forecast" location URL.

Starting from a per-language browse-locations entry page, the script
expands each "browse-locations" link (recursively, across several worker
threads) until only concrete "weather-forecast" pages remain, then dumps
the collected URLs to locations.txt.

Ported from legacy Python 2 (urllib2 / Queue / print statement) to
Python 3; dead commented-out code from the original was removed.
"""
import re
import threading
import time
from queue import Queue
from threading import Thread
from urllib.request import urlopen

lang = "fr"   # site language code used to build the entry URL
count = 0     # global tally of forecast pages found (progress reporting only)


class Location:
    """A place on the site, e.g. Location(False, "China", "Beijing", "zh").

    is_beyond_country is True for supra-national areas such as
    Location(True, "", "Asia", "zh").
    """

    def __init__(self, is_beyond_country, country_name, loc_name, lang):
        self.country_name = country_name
        self.loc_name = loc_name
        self.lang = lang
        self.is_beyond_country = is_beyond_country


prn_lock = threading.RLock()


def GetLocationURLs(url, recursive):
    """Return the list of weather-forecast URLs reachable from *url*.

    A URL containing "weather-forecast" is a leaf and is returned as-is;
    otherwise the page is fetched and its location links are followed
    (recursively when *recursive* is true, one level deep otherwise).
    """
    global count
    if url.find("weather-forecast") != -1:
        # The counter is shared by every worker thread, so the increment
        # itself is protected — the original only locked the print.
        with prn_lock:
            count = count + 1
            if count % 500 == 0:
                print("count:%d" % count)
        return [url]
    page = urlopen(url).read().decode("utf-8", "replace")
    time.sleep(0.01)  # small delay to be polite to the server
    # e.g. <h6><a href="http://www.accuweather.com/zh/browse-locations/afr"><em>Africa</em></a></h6>
    pattern = "<h6><a href=\"(.*)\"><em>(.*)</em></a></h6>"
    locs = re.findall(pattern, page)
    locs = [(u, name) for u, name in locs
            if u.find("browse-locations") != -1 or u.find("weather-forecast") != -1]
    if not recursive:
        return [u for u, name in locs]
    urls = []
    for sub_url, _name in locs:
        urls.extend(GetLocationURLs(sub_url, True))
    return urls


def main():
    """Crawl the entry page and write every forecast URL to locations.txt."""
    entry_url = "http://www.accuweather.com/%s/browse-locations/eur/fr" % lang
    sub_urls = GetLocationURLs(entry_url, False)
    print(len(sub_urls))
    print(sub_urls)

    q = Queue()
    location_urls = []
    thread_num = 5
    lock = threading.RLock()
    for url in sub_urls:
        q.put(url)

    def working():
        # Worker: expand one top-level URL into all of its forecast URLs.
        while True:
            url = q.get()
            try:
                lst = GetLocationURLs(url, True)
                print("%s %d urls " % (url, len(lst)))
                with lock:
                    location_urls.extend(lst)
            finally:
                # Always signal completion — without this, one network
                # error killed the worker mid-item and q.join() hung.
                q.task_done()

    for _ in range(thread_num):
        t = Thread(target=working)
        t.daemon = True
        t.start()
    q.join()

    with open("locations.txt", "w") as fp:
        fp.write("\n".join(location_urls))


if __name__ == "__main__":
    main()

FetchLocation.py如下:

# encoding=utf-8
"""Extract location names and weather-condition strings from accuweather pages.

Reads a file of "weather-forecast" URLs (produced by crawl_weather.py),
fetches each page, pulls the breadcrumb location names and the
<span class="cond"> condition texts, and writes the de-duplicated results
to location_name.fr and conditions.fr.

Ported from legacy Python 2 (urllib2 / Queue / HTMLParser) to Python 3.
"""
import html
import re
import sys
import threading
from queue import Queue
from threading import Thread
from xml.dom import minidom
from urllib.request import urlopen

q = Queue()
locks = [threading.RLock() for i in range(2)]
ThreadNumber = 20
locations = {}  # de-duplicated location names (dict used as a set)
conds = {}      # de-duplicated condition strings (dict used as a set)


def FindCountryBreadCrumbs(page):
    """Return the <ul id="country-breadcrumbs"> ... </ul> fragment of *page*.

    Returns "" when the breadcrumb list is missing — the original left
    ``end`` unbound in that case and crashed with a NameError.
    """
    lines = page.splitlines()
    start = -1
    end = -1
    opened = False
    for i, line in enumerate(lines):
        if line.find("<ul id=\"country-breadcrumbs\">") != -1:
            start = i
            opened = True
        if opened and line.find("</ul>") != -1:
            end = i
            opened = False
    if start == -1 or end < start:
        return ""
    return "\n".join(lines[start:(end + 1)])


def GetText(nodelist):
    """Concatenate the unescaped text of the TEXT_NODE children in *nodelist*."""
    rc = []
    for node in nodelist:
        if node.nodeType == node.TEXT_NODE:
            rc.append(html.unescape(node.data))
    return ''.join(rc)


def FindCondition(page):
    """Return every <span class="cond">...</span> text found in *page*."""
    pat = "<span class=\"cond\">(.*?)</span>"
    return [html.unescape(cd) for cd in re.findall(pat, page)]


def ExtractInfo(url):
    """Fetch *url* and return (location_names, condition_strings).

    Best-effort: any fetch or parse failure yields two empty lists — the
    original returned a bare [] on failure, which broke the
    ``locs, cds = ExtractInfo(url)`` unpacking in the worker.
    """
    try:
        page = urlopen(url).read().decode("utf-8", "replace")
        text = html.unescape(FindCountryBreadCrumbs(page))
        dom = minidom.parseString(text)
    except Exception:
        return [], []
    locs = []
    for li in dom.getElementsByTagName("li"):
        adr_list = li.getElementsByTagName("a")
        if adr_list:
            locs.append(GetText(adr_list[0].childNodes))
        strs = li.getElementsByTagName("strong")
        if strs:
            locs.append(GetText(strs[0].childNodes))
    return locs, FindCondition(page)


def AddMap(lst, m):
    """Insert every element of *lst* into dict *m* (used as a set)."""
    for x in lst:
        if m.get(x) is None:
            m[x] = 1


def working():
    """Worker: process one batch of URLs, then merge into the global maps."""
    while True:
        urls = q.get()
        try:
            m = {}
            m2 = {}
            for url in urls:
                locs, cds = ExtractInfo(url)
                AddMap(locs, m)
                AddMap(cds, m2)
            with locks[1]:
                AddMap(list(m.keys()), locations)
                AddMap(list(m2.keys()), conds)
        finally:
            # Always signal completion, or q.join() would hang after an
            # unexpected error killed this worker mid-batch.
            q.task_done()


def main():
    if len(sys.argv) < 2:
        exit()
    loc_path = sys.argv[1]
    with open(loc_path, "r") as fp:
        urls = [line.strip() for line in fp]
    # Split the URL list into one contiguous batch per worker thread.
    blocks = len(urls) // ThreadNumber + 1  # integer division (py2 "/" bug)
    for start in range(0, len(urls), blocks):
        end = min(start + blocks, len(urls))
        q.put(urls[start:end])
    for i in range(ThreadNumber):
        t = Thread(target=working)
        t.daemon = True
        t.start()
    q.join()
    with open("location_name.fr", "w") as fp:
        fp.write("\n".join(locations.keys()))
    with open("conditions.fr", "w") as fp:
        fp.write("\n".join(conds.keys()))


if __name__ == '__main__':
    main()

希望本文所述对大家的Python程序设计有所帮助。

(责任编辑:admin)






帮助中心
会员注册
找回密码
新闻中心
快捷通道
域名登录面板
虚机登录面板
云主机登录面板
关于我们
关于我们
联系我们
联系方式

售前咨询:17830004266(重庆移动)

企业QQ:383546523

《中华人民共和国工业和信息化部》 编号:ICP备00012341号

Copyright © 2002 -2018 香港云主机 版权所有
声明:香港云主机品牌标志、品牌吉祥物均已注册商标,版权所有,窃用必究

云官方微信

在线客服

  • 企业QQ: 点击这里给我发消息
  • 技术支持:383546523

  • 公司总台电话:17830004266(重庆移动)
  • 售前咨询热线:17830004266(重庆移动)