How to Crawl a Foreign Weather Forecast Website with Python
This article works through an example of crawling a foreign weather forecast website (AccuWeather) with Python. It is shared here for your reference; the details are as follows:
crawl_weather.py, which collects the forecast-page URLs, is as follows:
#encoding=utf-8
import urllib2
import time
import threading
from threading import Thread
from Queue import Queue
import re

lang = "fr"
count = 0

class Location:
    # e.g. Location(False, "China", "Beijing", "zh")
    #      Location(True, "", "Asia", "zh")
    def __init__(self, is_beyond_country, country_name, loc_name, lang):
        self.country_name = country_name
        self.loc_name = loc_name
        self.lang = lang
        self.is_beyond_country = is_beyond_country

prn_lock = threading.RLock()

def GetLocationURLs(url, recursive):
    # Walk the browse-locations tree; leaf pages contain "weather-forecast".
    global count
    if url.find("weather-forecast") != -1:
        count = count + 1
        if count % 500 == 0:
            prn_lock.acquire()
            print "count:%d" % (count)
            prn_lock.release()
        return [url]
    page = urllib2.urlopen(url).read()
    time.sleep(0.01)
    # matches e.g. <h6><a href="http://www.accuweather.com/zh/browse-locations/afr"><em>Africa</em></a></h6>
    pattern = "<h6><a href=\"(.*)\"><em>(.*)</em></a></h6>"
    locs = re.findall(pattern, page)
    locs = [(url, name) for url, name in locs
            if url.find("browse-locations") != -1 or url.find("weather-forecast") != -1]
    if not recursive:
        return [url for url, name in locs]
    urls = []
    for _url, _name in locs:
        urls.extend(GetLocationURLs(_url, True))
    return urls

#entry_url = "http://www.accuweather.com/zh/browse-locations"
#regions = ["afr", "ant", "arc", "asi", "cac", "eur", "mea", "nam", "ocn", "sam"]
entry_url = "http://www.accuweather.com/%s/browse-locations/eur/fr" % (lang)

# First level: collect the sub-pages of the entry page without recursing.
sub_urls = GetLocationURLs(entry_url, False)
print len(sub_urls)
print sub_urls

q = Queue()
location_urls = []
ThreadNum = 5
lock = threading.RLock()

for url in sub_urls:
    q.put(url)

def working():
    # Worker thread: take one sub-tree off the queue and crawl it fully.
    while True:
        url = q.get()
        lst = GetLocationURLs(url, True)
        print "%s %d urls " % (url, len(lst))
        lock.acquire()
        location_urls.extend(lst)
        lock.release()
        q.task_done()

for i in range(ThreadNum):
    t = Thread(target=working)
    t.setDaemon(True)
    t.start()

q.join()

fp = open('locations.txt', "w")
fp.write("\n".join(location_urls))
fp.close()

# Quick single-page sanity check:
#root_url = "http://www.accuweather.com/zh/cn/andingmen/57494/weather-forecast/57494"
#page = urllib2.urlopen(root_url).read()
#print page
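The script above is Python 2 throughout (urllib2, Queue, print statements). For readers on Python 3, here is a minimal sketch of the same Queue-plus-daemon-worker pattern it relies on, assuming the page markup still matches the <h6><a ...><em>...</em></a></h6> pattern; the seed URL and regex are illustrative and not verified against the current site:

import re
import threading
import urllib.request
from queue import Queue

def get_location_urls(url):
    # Fetch one listing page and return the candidate links on it.
    page = urllib.request.urlopen(url).read().decode("utf-8", "ignore")
    pattern = r'<h6><a href="(.*?)"><em>.*?</em></a></h6>'
    return [u for u in re.findall(pattern, page)
            if "browse-locations" in u or "weather-forecast" in u]

q = Queue()
results = []
lock = threading.Lock()

def worker():
    while True:
        url = q.get()
        try:
            found = get_location_urls(url)
            with lock:
                results.extend(found)
        except Exception:
            pass  # skip pages that fail to download or parse
        finally:
            q.task_done()

for _ in range(5):  # five workers, matching ThreadNum above
    threading.Thread(target=worker, daemon=True).start()

q.put("http://www.accuweather.com/fr/browse-locations/eur/fr")  # illustrative seed
q.join()  # returns once every queued URL has been marked done
print(len(results), "links collected")

The q.join()/q.task_done() pairing is what lets the main thread wait for all queued work to finish without joining each daemon thread individually.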
FetchLocation.py, which extracts the location names and weather-condition strings from those pages, is as follows:
#encoding=utf-8
import sys
import urllib2
import threading
from threading import Thread
from Queue import Queue
import re
from xml.dom import minidom
import HTMLParser

q = Queue()
locks = [threading.RLock() for i in range(2)]
ThreadNumber = 20
locations = {}
conds = {}

def FindCountryBreadCrumbs(page):
    # Cut the <ul id="country-breadcrumbs"> ... </ul> fragment out of the page.
    lines = page.splitlines()
    count = 0
    start = -1
    end = -1
    opened = False
    for line in lines:
        if line.find("<ul id=\"country-breadcrumbs\">") != -1:
            start = count
            opened = True
        if opened and line.find("</ul>") != -1:
            end = count
            opened = False
        count = count + 1
    return "\n".join(lines[start:(end + 1)])

def GetText(nodelist):
    rc = []
    for node in nodelist:
        if node.nodeType == node.TEXT_NODE:
            rc.append(HTMLParser.HTMLParser().unescape(node.data))
    return ''.join(rc)

def FindCondition(page):
    pat = "<span class=\"cond\">(.*?)</span>"
    cds = re.findall(pat, page)
    cds = [HTMLParser.HTMLParser().unescape(cd).encode("utf-8") for cd in cds]
    return cds

def ExtractInfo(url):
    # Returns (location names from the breadcrumb, condition strings).
    try:
        page = urllib2.urlopen(url).read()
    except Exception, e:
        return [], []
    text = FindCountryBreadCrumbs(page)
    text = HTMLParser.HTMLParser().unescape(text)
    dom = minidom.parseString(text.encode("utf-8"))
    locs = []
    lis = dom.getElementsByTagName("li")
    for li in lis:
        adr_list = li.getElementsByTagName("a")
        if adr_list:
            locs.append(GetText(adr_list[0].childNodes).encode("utf-8"))
        strs = li.getElementsByTagName("strong")
        if strs:
            locs.append(GetText(strs[0].childNodes).encode("utf-8"))
    cds = FindCondition(page)
    return locs, cds

def AddMap(lst, m):
    # Use the dict as a set to de-duplicate entries.
    for x in lst:
        if m.get(x) == None:
            m[x] = 1

def working():
    while True:
        urls = q.get()
        m = {}
        m2 = {}
        for url in urls:
            locs, cds = ExtractInfo(url)
            AddMap(locs, m)
            AddMap(cds, m2)
        # Merge the per-thread maps into the shared ones under a lock.
        locks[1].acquire()
        AddMap(m.keys(), locations)
        AddMap(m2.keys(), conds)
        locks[1].release()
        q.task_done()

def main():
    if len(sys.argv) < 2:
        exit()
    loc_path = sys.argv[1]
    fp = open(loc_path, "r")
    urls = [line.strip() for line in fp]
    fp.close()
    # Split the URL list into one block per thread.
    blocks = len(urls) / ThreadNumber + 1
    for start in range(0, len(urls), blocks):
        end = start + blocks
        if end > len(urls):
            end = len(urls)
        q.put(urls[start:end])
    for i in range(ThreadNumber):
        t = Thread(target=working)
        t.setDaemon(True)
        t.start()
    q.join()
    fp = open("location_name.fr", "w")
    fp.write("\n".join(locations.keys()))
    fp.close()
    fp = open("conditions.fr", "w")
    fp.write("\n".join(conds.keys()))
    fp.close()

if __name__ == '__main__':
    main()
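To run the pipeline end to end, first let crawl_weather.py write the collected forecast URLs to locations.txt, then feed that file to FetchLocation.py as its only command-line argument:

python crawl_weather.py
python FetchLocation.py locations.txt

The second script leaves the de-duplicated place names in location_name.fr and the weather-condition strings in conditions.fr. Note that both scripts target Python 2: urllib2, Queue, and HTMLParser were all renamed or merged in Python 3.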
I hope this article proves helpful for your Python programming.