Helper module URLsParser.py:
#-*- coding: utf-8 -*-
from sgmllib import SGMLParser, SGMLParseError
import urllib
import urlparse

class URLLister(SGMLParser):
    """Collects every link target on a page: <a href>, <img src>, and WML <go href>."""
    def reset(self):
        SGMLParser.reset(self)
        self.urls = []
    def start_a(self, attrs):
        href = [v for k, v in attrs if k == 'href']
        if href:
            self.urls.extend(href)
    def start_img(self, attrs):
        src = [v for k, v in attrs if k == 'src']
        if src:
            self.urls.extend(src)
    def start_go(self, attrs):
        href = [v for k, v in attrs if k == 'href']
        if href:
            self.urls.extend(href)
def URLFormat(current, relative):
    """Resolve a link found on page `current` into an absolute URL."""
    currenturl = 'http://' + urlparse.urlsplit(current)[1]
    relativeurl = relative.strip()
    if relativeurl[:3] != '../':
        if relativeurl[:7] == 'http://':
            return relativeurl
        elif relativeurl[:2] == './':
            return currenturl + relativeurl[1:]
        else:
            return currenturl + '/' + relativeurl
    # Strip one leading '../' per parent directory, walking up from the
    # directory that contains the current page; never climb above the host.
    currenturl = current[:current.rfind('/')]
    while relativeurl[:3] == '../':
        relativeurl = relativeurl[3:]
        if currenturl.rfind('/') <= len('http:/'):
            break
        currenturl = currenturl[:currenturl.rfind('/')]
    return currenturl + '/' + relativeurl
def URLsParser(url):
    """Fetch `url` and return (links, has_next_page).
    On failure `links` is an error code: -1 fetch failed, -2 parse failed or no links found."""
    result = []
    ifNext = False
    if urlparse.urlsplit(url)[1] == 'bbs.roboo.com':
        return result, ifNext    # skip the forum host
    parser = URLLister()
    try:
        usock = urllib.urlopen(url)
        txtBody = usock.read()
        usock.close()
    except:
        print 'Can not connect to the page:', url
        return -1, ifNext
    # '下页' ("next page") in the body means more result pages follow.
    if txtBody.find('>下页</a>') != -1:
        ifNext = True
    try:
        parser.feed(txtBody)
    except SGMLParseError:
        return -2, ifNext
    if not parser.urls:
        return -2, ifNext
    for curl in parser.urls:
        result.append(URLFormat(url, curl))
    return result, ifNext
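
A quick way to exercise the two helpers above is to call them directly. The snippet below is a minimal sketch, not part of the original script; the query string is only an illustration, and it relies on the return convention described in the docstring (a (links, has_next) tuple on success, a negative code in place of the list on failure):

if __name__ == '__main__':
    # URLFormat resolves a relative link against the page it was found on.
    print URLFormat('http://wap.roboo.com/a/b.wml', '../c.wml')
    # -> http://wap.roboo.com/c.wml
    links, has_next = URLsParser('http://wap.roboo.com/Search.wml?st=0&q=test')
    if type(links) == list:
        for link in links[:5]:
            print link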
Main program DDetecter.py:
#-*- coding: utf-8 -*-
import threading, urllib, os, socket
from Queue import Queue
from URLsParser import URLsParser
from time import sleep

WIP = Queue()    # work-in-progress queue of (url, source_page, keyword) tuples
Finished = False
ItemsPerPage = {'ring': 8, 'search': 8, 'video': 4, 'theme': 4, 'image': 3, 'game': 8, 'sms': 8, 'book': 8}
WMLName = {'ring': 'Sring.wml',
           'search': 'Search.wml',
           'video': 'Svideo.wml',
           'theme': 'Stheme.wml',
           'image': 'Simage.wml',
           'game': 'Sgame.wml',
           'sms': 'Sms.wml',
           'book': 'Sread.wml'}
URLS_DEAD = []
lock = threading.RLock()
URL_HOME = 'http://wap.roboo.com/'

def logDeadUrlsToFile(strs):
    """Append one (url, source_page, keyword) record to the dead-URL report."""
    fhandle = open(rootdir + '/Report/urls_dead.txt', 'a')
    fhandle.write(strs[0] + ',' + strs[1] + ',' + strs[2] + '\n')
    fhandle.close()

class threadDetecter(threading.Thread):
    """Pulls (url, source_page, keyword) tuples off WIP and checks each URL."""
    def __init__(self, name):
        self._urls = []
        self._thePage = []
        self._theKeyWord = []
        threading.Thread.__init__(self, name=name)
    def _splitParameters(self, wip):
        self._urls.append(wip[0])
        self._thePage.append(wip[1])
        self._theKeyWord.append(wip[2])
    def run(self):
        global WIP
        # Take at most 100 items per thread so several detecters can share the queue.
        batch = min(WIP.qsize(), 100)
        for i in range(batch):
            self._splitParameters(WIP.get(timeout=3))
        for i, url in enumerate(self._urls):
            try:
                usock = urllib.urlopen(url)
                usock.close()
            except:
                tupleResult = (url, self._thePage[i], self._theKeyWord[i])
                lock.acquire()
                try:
                    URLS_DEAD.append(tupleResult)
                    logDeadUrlsToFile(tupleResult)
                finally:
                    lock.release()
        print self.getName() + ':', 'Done.', 'WIP:', WIP.qsize()
class threadSpider(threading.Thread):
    """Walks the search result pages for one keyword and queues every extracted link."""
    def __init__(self, name, keyword, channel='ring'):
        self._itemsPerPage = ItemsPerPage[channel]
        self._url = URL_HOME + WMLName[channel] + '?st=%d&q=%s'
        self._keyword = keyword
        self._pageNumber = 0
        threading.Thread.__init__(self, name=name)
    def run(self):
        global WIP, Finished
        _key = urllib.quote(self._keyword)
        _ifNext = True
        while self._pageNumber < 150:
            print self.getName() + ':', 'Threads in use:', threading.activeCount(), 'thePage is:', self._pageNumber, 'WIP:', WIP.qsize()
            if WIP.qsize() > 1000:
                # Throttle: let the detecter threads drain the queue first.
                sleep(2)
                continue
            self._pageNumber += 1
            if not _ifNext:
                break
            _url = self._url % (self._pageNumber * self._itemsPerPage, _key)
            AllLinks, _ifNext = URLsParser(_url)
            if type(AllLinks) == int:
                if AllLinks == -1:
                    logDeadUrlsToFile((_url, 'N/A', self._keyword))
                break
            for url in AllLinks:
                WIP.put((url, _url, self._keyword))    # Queue.put is thread-safe
        print self.getName() + ':', 'Spider finished.'
if __name__ == '__main__':
    socket.setdefaulttimeout(30)
    rootdir = os.path.dirname(os.path.abspath(__file__))
    if not os.path.isdir(rootdir + '/Report'):
        os.chdir(rootdir)
        os.mkdir('Report')
    spider = threadSpider('spider_search', '周杰伦', channel='search')
    spider.start()
    sleep(3)
    while 1:
        sleep(3)
        print 'Main:', 'Threads in use:', threading.activeCount(), 'WIP:', WIP.qsize(), 'URLS_DEAD:', len(URLS_DEAD)
        if threading.activeCount() < 2 and WIP.qsize() <= 0:
            break
        if WIP.qsize() > 100:
            # Spawn detecter threads up to a pool of roughly 10, then wait for them.
            listdetecters = []
            for i in range(10 - threading.activeCount() + 1):
                detecter = threadDetecter('detecter' + str(i))
                listdetecters.append(detecter)
                detecter.start()
            for detecter in listdetecters:
                detecter.join(10)
    print '^_^ The search result analysis finished.'
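
For reference, the hand-off between threadSpider and threadDetecter is the standard Queue producer/consumer pattern. The stripped-down sketch below (names and URLs are illustrative, not from the script) shows why no explicit lock is needed around put/get: Queue is already thread-safe.

#-*- coding: utf-8 -*-
import threading
from Queue import Queue, Empty

work = Queue()

def producer():
    # Stands in for threadSpider: enqueue (url, page, keyword) tuples.
    for i in range(20):
        work.put(('http://example.com/%d' % i, 'page', 'keyword'))

def consumer():
    # Stands in for threadDetecter: drain the queue until it stays empty.
    while True:
        try:
            url, page, keyword = work.get(timeout=3)
        except Empty:
            break
        # ... check the url here ...
        work.task_done()

p = threading.Thread(target=producer)
p.start()
consumers = [threading.Thread(target=consumer) for i in range(4)]
for c in consumers:
    c.start()
p.join()
work.join()    # returns once every queued item has been marked task_done()

In DDetecter.py the RLock is still useful for guarding the shared URLS_DEAD list and the report file, but the WIP.put and WIP.get calls themselves could drop it.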