The helper module, URLsParser.py:
from sgmllib import SGMLParser, SGMLParseError
import urllib
import urlparse


class URLLister(SGMLParser):
    """Collect link targets from <a>, <img> and WML <go> tags."""

    def reset(self):
        SGMLParser.reset(self)
        self.urls = []

    def start_a(self, attrs):
        href = [v for k, v in attrs if k == 'href']
        if href:
            self.urls.extend(href)

    def start_img(self, attrs):
        src = [v for k, v in attrs if k == 'src']
        if src:
            self.urls.extend(src)

    def start_go(self, attrs):
        href = [v for k, v in attrs if k == 'href']
        if href:
            self.urls.extend(href)


def URLFormat(current, relative):
    """Resolve a (possibly relative) link against the current page URL."""
    currenturl = 'http://' + urlparse.urlsplit(current)[1]
    relativeurl = relative.strip()
    if relativeurl[:3] != '../':
        if relativeurl[:7] == 'http://':
            return relativeurl                  # already absolute
        elif relativeurl[:2] == './':
            return currenturl + relativeurl[1:]
        else:
            return currenturl + '/' + relativeurl
    # Strip one '../' and go up one directory level per iteration,
    # but never climb above the host part of the URL.
    currenturl = current[:current.rfind('/')]
    while relativeurl[:3] == '../':
        relativeurl = relativeurl[3:]
        if currenturl.rfind('/') <= len('http:/'):
            break
        currenturl = currenturl[:currenturl.rfind('/')]
    return currenturl + '/' + relativeurl


def URLsParser(url):
    """Fetch a page and return (list_of_absolute_links, has_next_page).

    Returns (-1, False) on a fetch error, and (-2, False) on a parse
    error or when no links are found, so callers can always unpack two
    values.
    """
    result = []
    ifNext = False
    if urlparse.urlsplit(url)[1] == 'bbs.roboo.com':
        return result, ifNext
    parser = URLLister()
    try:
        usock = urllib.urlopen(url)
        txtBody = usock.read()
    except IOError:
        print 'Cannot connect to the page:', url
        return -1, False
    # str.find() returns -1 when not found, so compare explicitly
    # ('下页' is the site's "next page" link text).
    if txtBody.find('>下页</a>') != -1:
        ifNext = True
    try:
        parser.feed(txtBody)
    except SGMLParseError:
        return -2, False
    usock.close()
    if not parser.urls:
        return -2, False
    for curl in parser.urls:
        result.append(URLFormat(url, curl))
    return result, ifNext
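For reference, the standard library already solves the relative-link problem that URLFormat handles by hand: urlparse.urljoin resolves './', '../' and absolute links in one call, and makes a handy sanity check against URLFormat. The URLs below are made up for illustration:

    >>> import urlparse
    >>> urlparse.urljoin('http://wap.roboo.com/a/b.wml', '../c.wml')
    'http://wap.roboo.com/c.wml'
    >>> urlparse.urljoin('http://wap.roboo.com/a/b.wml', './d.wml')
    'http://wap.roboo.com/a/d.wml'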
The main program, DDetecter.py:
# -*- coding: utf-8 -*-
import threading, urllib, os, socket
from Queue import Queue
from URLsParser import URLsParser
from time import sleep

WIP = Queue()          # work-in-progress queue of (url, page_url, keyword)
Finished = False       # reserved flag; not currently set anywhere
ItemsPerPage = {'ring': 8, 'search': 8, 'video': 4, 'theme': 4,
                'image': 3, 'game': 8, 'sms': 8, 'book': 8}
WMLName = {'ring': 'Sring.wml', 'search': 'Search.wml', 'video': 'Svideo.wml',
           'theme': 'Stheme.wml', 'image': 'Simage.wml', 'game': 'Sgame.wml',
           'sms': 'Sms.wml', 'book': 'Sread.wml'}
URLS_DEAD = []
lock = threading.RLock()
URL_HOME = 'http://wap.roboo.com/'


def logDeadUrlsToFile(strs):
    fhandle = open(rootdir + '/Report/urls_dead.txt', 'a')
    fhandle.write(strs[0] + ',' + strs[1] + ',' + strs[2] + '\n')
    fhandle.close()


class threadDetecter(threading.Thread):
    """Pull up to 100 URLs off the queue and probe each one."""

    def __init__(self, name):
        self._urls = []
        self._thePage = []
        self._theKeyWord = []
        threading.Thread.__init__(self, name=name)

    def _splitParameters(self, wip):
        self._urls.append(wip[0])
        self._thePage.append(wip[1])
        self._theKeyWord.append(wip[2])

    def run(self):
        global WIP
        print self.getName() + ':', 'started.', 'WIP:', WIP.qsize()
        for i in range(min(WIP.qsize(), 100)):
            self._splitParameters(WIP.get(timeout=3))
        for url in self._urls:
            try:
                usock = urllib.urlopen(url)
                usock.close()
            except IOError:
                i = self._urls.index(url)
                tupleResult = (url, self._thePage[i], self._theKeyWord[i])
                lock.acquire()
                try:
                    URLS_DEAD.append(tupleResult)
                    logDeadUrlsToFile(tupleResult)
                finally:
                    lock.release()


class threadSpider(threading.Thread):
    """Walk the paged search results and queue every link found."""

    def __init__(self, name, keyword, channel='ring'):
        self._itemsPerPage = ItemsPerPage[channel]
        self._url = URL_HOME + WMLName[channel] + '?st=%d&q=%s'
        self._keyword = keyword
        self._pageNumber = 0
        threading.Thread.__init__(self, name=name)

    def run(self):
        global WIP
        _key = urllib.quote(self._keyword)
        _ifNext = True
        while self._pageNumber < 150:
            print self.getName() + ':', 'Threads in use:', \
                threading.activeCount(), 'thePage is:', self._pageNumber, \
                'WIP:', WIP.qsize()
            if WIP.qsize() > 1000:      # back off while the queue is overfull
                sleep(2)
                continue
            self._pageNumber += 1
            if not _ifNext:
                break
            _url = self._url % (self._pageNumber * self._itemsPerPage, _key)
            # URLsParser always returns a 2-tuple, so unpacking is safe.
            AllLinks, _ifNext = URLsParser(_url)
            if AllLinks == -1:
                logDeadUrlsToFile((_url, str(self._pageNumber), self._keyword))
                break
            elif type(AllLinks) == list:
                for url in AllLinks:
                    lock.acquire()
                    try:
                        WIP.put((url, _url, self._keyword))
                    finally:
                        lock.release()
            else:
                break
        print self.getName() + ':', 'Spider finished.'


if __name__ == '__main__':
    socket.setdefaulttimeout(30)
    rootdir = os.path.dirname(__file__)
    if not os.path.isdir(rootdir + '/Report'):
        os.chdir(rootdir)
        os.mkdir('Report')
    spider = threadSpider('spider_search', '周杰伦', channel='search')
    spider.start()
    sleep(3)
    while 1:
        sleep(3)
        print 'Main:', 'Threads in use:', threading.activeCount(), \
            'WIP:', WIP.qsize(), 'URLS_DEAD:', len(URLS_DEAD)
        if threading.activeCount() < 2 and WIP.qsize() <= 0:
            break
        if WIP.qsize() > 100:
            listdetecters = []          # collect all detecters before joining
            for i in range(10 - threading.activeCount() + 1):
                detecter = threadDetecter('detecter' + str(i))
                listdetecters.append(detecter)
                detecter.start()
            for detecter in listdetecters:
                detecter.join(10)
    print '^_^ The search result analysis finished.'
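Before starting the full spider, URLsParser can be exercised on its own. This is a minimal sketch, assuming the site is reachable; it requests the same first results page that threadSpider would:

import urllib
from URLsParser import URLsParser

# First results page of the 'search' channel (8 items per page).
url = 'http://wap.roboo.com/Search.wml?st=8&q=' + urllib.quote('周杰伦')
links, has_next = URLsParser(url)
if isinstance(links, list):
    print 'found %d links, next page exists: %s' % (len(links), has_next)
else:
    print 'URLsParser error code:', links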