Functions used, URLsParser.py:
from sgmllib import SGMLParser, SGMLParseError
import urllib
import urlparse

class URLLister(SGMLParser):
    """Collect every link target (<a href>, <img src>, <go href>) found in a page."""

    def reset(self):
        SGMLParser.reset(self)
        self.urls = []

    def start_a(self, attrs):
        href = [v for k, v in attrs if k == 'href']
        if href:
            self.urls.extend(href)

    def start_img(self, attrs):
        href = [v for k, v in attrs if k == 'src']
        if href:
            self.urls.extend(href)

    def start_go(self, attrs):
        href = [v for k, v in attrs if k == 'href']
        if href:
            self.urls.extend(href)
            
def URLFormat(current, relative):
    """Turn a link found on the page `current` into an absolute URL
    (urlparse.urljoin would handle all of these cases as well)."""
    currenturl = 'http://' + urlparse.urlsplit(current)[1]   # scheme + host of the current page
    relativeurl = relative.strip()

    if relativeurl[:3] != '../':
        if relativeurl[:7] == 'http://':
            return relativeurl
        elif relativeurl[:2] == './':
            return currenturl + relativeurl[1:]
        else:
            return currenturl + '/' + relativeurl

    # Resolve '../' prefixes: drop the file name, then climb one directory level per '../'.
    base = current[:current.rfind('/')]
    while relativeurl[:3] == '../':
        relativeurl = relativeurl[3:]
        if base.rfind('/') <= len('http:/'):     # never climb above the host root
            break
        base = base[:base.rfind('/')]

    return base + '/' + relativeurl
            
def URLsParser(url):
    """Fetch `url` and return (list_of_absolute_links, has_next_page),
    or a negative int on failure (-1: cannot open, -2: cannot parse / no links)."""
    result = []
    ifNext = False

    if urlparse.urlsplit(url)[1] == 'bbs.roboo.com':
        return result, ifNext

    parser = URLLister()
    try:
        usock = urllib.urlopen(url)
        txtBody = usock.read()
    except:
        print 'Can not connect the Page:', url
        return -1
    usock.close()

    # '下页' is the "next page" link text on the target site; find() returns -1 when missing
    if txtBody.find('>下页</a>') != -1:
        ifNext = True

    try:
        parser.feed(txtBody)
    except SGMLParseError, message:
        #print url, ':::', message
        return -2

    if not parser.urls:
        return -2

    for curl in parser.urls:
        result.append(URLFormat(url, curl))

    return result, ifNext
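
Before wiring URLFormat into the spider it is handy to sanity-check its three branches (absolute links, './' links and '../' links) offline. The snippet below is only an illustration; the sample page and link paths are invented:

    # A minimal, offline sanity check of URLFormat; the URLs below are made-up examples.
    from URLsParser import URLFormat

    current = 'http://wap.roboo.com/ring/list/page1.wml'

    print URLFormat(current, 'http://wap.roboo.com/Search.wml')   # absolute links come back unchanged
    print URLFormat(current, './Sring.wml')                       # './' is resolved against the site root
    print URLFormat(current, '../other/page2.wml')                # each '../' climbs one directory level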


Main program, DDetecter.py:
#-*- coding: utf-8 -*-
import threading, urllib, os, socket
from Queue import Queue
from URLsParser import URLsParser
from time import sleep

WIP = Queue()
Finished = False
ItemsPerPage = {'ring': 8, 'search': 8, 'video': 4, 'theme': 4, 'image': 3, 'game': 8, 'sms': 8, 'book': 8}
WMLName = {'ring': 'Sring.wml',
           'search': 'Search.wml',
           'video': 'Svideo.wml',
           'theme': 'Stheme.wml',
           'image': 'Simage.wml',
           'game': 'Sgame.wml',
           'sms': 'Sms.wml',
           'book': 'Sread.wml'}
URLS_DEAD = []
lock = threading.RLock()
URL_HOME = 'http://wap.roboo.com/'

def logDeadUrlsToFile(strs):
    fhandle = open(rootdir + '/Report/urls_dead.txt', 'a')
    fhandle.write(strs[0] + ',' + strs[1] + ',' + strs[2] + '\n')
    fhandle.close()

class threadDetecter(threading.Thread):
    def __init__(self, name):
        self._urls = []
        self._thePage = []
        self._theKeyWord = []
        threading.Thread.__init__(self, name=name)

    def _splitParameters(self, wip):
        self._urls.append(wip[0])
        self._thePage.append(wip[1])
        self._theKeyWord.append(wip[2])

    def run(self):
        global WIP
        print self.getName() + ':', 'Done.', 'WIP:', WIP.qsize()

        # take at most 100 work items off the queue for this thread
        if WIP.qsize() < 100:
            for i in range(WIP.qsize()):
                tupleWIP = WIP.get(timeout=3)
                self._splitParameters(tupleWIP)
        else:
            for i in range(100):
                tupleWIP = WIP.get(timeout=3)
                self._splitParameters(tupleWIP)

        for url in self._urls:
            try:
                usock = urllib.urlopen(url)
                usock.close()
            except:
                # record the dead link together with the page and keyword it came from
                tupleResult = (url, self._thePage[self._urls.index(url)], self._theKeyWord[self._urls.index(url)])
                try:
                    lock.acquire()
                    URLS_DEAD.append(tupleResult)
                    logDeadUrlsToFile(tupleResult)
                finally:
                    lock.release()
                
class threadSpider(threading.Thread):
    def __init__(self, name, keyword, channel='ring'):
        self._itemsPerPage = ItemsPerPage[channel]
        self._url = URL_HOME + WMLName[channel] + '?st=%d&q=%s'
        self._keyword = keyword
        self._pageNumber = 0
        threading.Thread.__init__(self, name=name)

    def run(self):
        global WIP, Finished
        _key = urllib.quote(self._keyword)
        _ifNext = True

        while self._pageNumber < 150:
            print self.getName() + ':', 'Threads in use:', threading.activeCount(), 'thePage is:', self._pageNumber, 'WIP:', WIP.qsize()

            # throttle the spider while the detecter threads catch up
            if WIP.qsize() > 1000:
                sleep(2)
                continue

            self._pageNumber += 1
            if _ifNext:
                _url = self._url % (self._pageNumber * self._itemsPerPage, _key)
                ret = URLsParser(_url)
                # URLsParser returns a negative int on failure, (links, has_next) on success
                if type(ret) == int:
                    if ret == -1:
                        logDeadUrlsToFile(('Can not open the page', _url, self._keyword))
                    break
                AllLinks, _ifNext = ret
                for url in AllLinks:
                    try:
                        lock.acquire()
                        WIP.put((url, _url, self._keyword))
                    finally:
                        lock.release()
            else:
                break
        print self.getName() + ':', 'Spider finished.'
                
if __name__ == '__main__':
    socket.setdefaulttimeout(30)

    rootdir = os.path.dirname(os.path.abspath(__file__))
    if not os.path.isdir(rootdir + '/Report'):
        os.chdir(rootdir)
        os.mkdir('Report')

    spider = threadSpider('spider_search', '周杰伦', channel='search')
    spider.start()
    #spider.join()

    sleep(3)

    while 1:
        sleep(3)
        print 'Main:', 'Threads in use:', threading.activeCount(), 'WIP:', WIP.qsize(), 'URLS_DEAD:', len(URLS_DEAD)

        if threading.activeCount() < 2 and WIP.qsize() <= 0:
            break

        if WIP.qsize() > 100:
            # spawn up to 10 detecter threads and wait for them before the next round
            listdetecters = []
            for i in range(10 - threading.activeCount() + 1):
                detecter = threadDetecter('detecter' + str(i))
                listdetecters.append(detecter)
                detecter.start()

            for detecter in listdetecters:
                detecter.join(10)
        else:
            continue

    print '^_^ The search result analysis finished.'
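
After a run, the dead links sit in Report/urls_dead.txt as comma-separated lines of url, page, keyword. A short follow-up script like the one below can summarise them per keyword; it is only an illustrative extra, not part of the original tool, and it assumes none of the logged fields itself contains a comma:

    # summarise_dead.py -- optional helper for reading back Report/urls_dead.txt
    import os

    rootdir = os.path.dirname(os.path.abspath(__file__))
    counts = {}
    for line in open(rootdir + '/Report/urls_dead.txt'):
        parts = line.strip().split(',')
        if len(parts) < 3:
            continue
        keyword = parts[-1]                          # the search keyword the dead link belonged to
        counts[keyword] = counts.get(keyword, 0) + 1

    for keyword, total in counts.items():
        print keyword, ':', total, 'dead links'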