1. URLsParser.py:
from sgmllib import SGMLParser, SGMLParseError import urllib import urlparse import time, sys
class URLLister(SGMLParser):
    """SGML parser that collects link targets while a document is fed to it.

    After ``feed()`` the attribute ``urls`` holds, in document order, the raw
    ``href`` value of every ``<a>`` and ``<go>`` tag and the raw ``src`` value
    of every ``<img>`` tag.
    """

    def reset(self):
        """Reset the parser state and start with an empty ``urls`` list."""
        SGMLParser.reset(self)
        self.urls = []

    def _collect(self, attrs, wanted):
        """Append the value of every attribute named *wanted* to ``urls``."""
        self.urls.extend(value for key, value in attrs if key == wanted)

    def start_a(self, attrs):
        """Record the ``href`` of an ``<a>`` tag."""
        self._collect(attrs, 'href')

    def start_img(self, attrs):
        """Record the ``src`` of an ``<img>`` tag."""
        self._collect(attrs, 'src')

    def start_go(self, attrs):
        """Record the ``href`` of a ``<go>`` tag."""
        self._collect(attrs, 'href')


def URLFormat(current, relative):
    """Return *relative* resolved to an absolute URL against the page *current*.

    Surrounding whitespace in *relative* is ignored.  Resolution is delegated
    to ``urlparse.urljoin`` so that document-relative (``c.html``, ``./c.html``),
    parent-relative (``../c.html``), root-relative (``/c.html``) and already
    absolute links of any scheme are all handled.  The previous hand-written
    logic resolved plain relative links against the site root, produced a
    double slash for root-relative links, never actually climbed a directory
    for ``../`` and prefixed non-``http://`` absolute links with the host.
    """
    return urlparse.urljoin(current, relative.strip())


def URLsParser(url):
    """Download *url* and return the absolute http(s) links found in it.

    Returns:
        A non-empty list of absolute URL strings on success;
        ``-1`` if the page could not be opened;
        ``-2`` if the markup could not be parsed or no usable link was found.
    """
    parser = URLLister()
    try:
        usock = urllib.urlopen(url)
    except Exception:
        # Best effort: any failure to open the page is reported as "dead",
        # but KeyboardInterrupt/SystemExit are no longer swallowed.
        print('Can not connect the Page: %s' % (url,))
        return -1
    try:
        parser.feed(usock.read())
        # Flush markup still buffered by the parser so a link at the very end
        # of the document is not lost.
        parser.close()
    except SGMLParseError:
        return -2
    finally:
        # Close the connection on the parse-error path as well.
        usock.close()

    result = []
    for link in parser.urls:
        try:
            absolute = URLFormat(url, link)
            scheme = urlparse.urlsplit(absolute)[0]
        except ValueError:
            # A single malformed link must not discard the whole page.
            continue
        # mailto:, javascript: and similar links cannot be crawled; they used
        # to be turned into bogus URLs on the current host.
        if scheme in ('http', 'https'):
            result.append(absolute)
    if not result:
        return -2
    return result
2. 主程序 Spider.py:
import re, urllib, threading, urlparse from lib.URLsParser import URLsParser from Queue import Queue from time import sleep import csv, os
# Shared crawl state, read and updated by the worker threads and the Spider.

# Pages appended by a worker once their links have been queued.
URLSTRAPPED = []
# Every link returned by URLsParser, in discovery order.
URLS = []
# (url, parent) pairs for pages URLsParser reported as unreachable (-1).
DEADURLS = []
# Pending work: (url, parent) tuples.
WORKFLOW = Queue()
# Counter kept in step with WORKFLOW: +1 per put, -1 per get.
WIPCount = 0
# Guards updates to the shared counters and lists above.
lock = threading.RLock()

# Start page of the crawl.
ROOT = 'http://www.roboo.com'
# ROOT's network location with its first dot-separated label removed;
# links are followed only when their host reduces to the same value.
HOST = urlparse.urlsplit(ROOT)[1].split('.', 1)[1]

# Set to True by a worker that starts and finds no queued work.
Finished = False
class ThreadSpider(threading.Thread):
    """Worker thread that takes one ``(url, parent)`` item off WORKFLOW and crawls it.

    The page's links are added to URLS, its same-site links are queued for
    later workers, and the page itself is appended to URLSTRAPPED.  A page
    that cannot be opened is recorded in DEADURLS and in the dead-link report.
    """

    def __init__(self, name):
        threading.Thread.__init__(self, name=name)

    @staticmethod
    def _same_site(url):
        """Return True if *url* belongs to the crawled site (HOST).

        The host is compared with its first dot-separated label removed.  A
        URL whose host has no dot, or that cannot be split, is treated as
        on-site, which is what the former catch-all fallback did.
        """
        try:
            return urlparse.urlsplit(url)[1].split('.', 1)[1] == HOST
        except (IndexError, ValueError):
            return True

    def _record_dead(self, page, parent):
        """Record *page* (linked from *parent*) as unreachable."""
        # Serialise the append so lines from concurrent workers cannot interleave.
        with lock:
            # ``rootdir`` is a module global assigned by the script entry point.
            with open(rootdir+'/Report/urls_dead.txt', 'a') as fhandle:
                fhandle.write(str(page) + ',' + str(parent) + '\n')
            DEADURLS.append((page, parent))

    def run(self):
        """Process a single queued page, or flag ``Finished`` if nothing is queued."""
        global URLS, WIPCount, Finished

        # Test the counter and take the item under one lock: previously two
        # workers could both see WIPCount == 1 and the loser blocked forever
        # in WORKFLOW.get().
        with lock:
            if WIPCount < 1 or WORKFLOW.empty():
                print('Work in Process have been finished.')
                Finished = True
                return
            page, parent = WORKFLOW.get(False)
            WIPCount -= 1
            # Skip the download for a page that is already done.  The old
            # check tested the parent inside the child loop, i.e. only after
            # the page had been fetched and recorded again.
            if page in URLSTRAPPED:
                return

        # Network I/O is done outside the lock so workers run in parallel.
        links = URLsParser(page)
        if isinstance(links, int) and links == -1:
            self._record_dead(page, parent)
            return
        if not isinstance(links, list):
            # -2: unparsable page or no links; nothing to record.
            return

        with lock:
            if page in URLSTRAPPED:
                # Another worker finished the same page while we were fetching.
                return
            # Mark the page first so a self-link is not queued again.
            URLSTRAPPED.append(page)
            URLS += links
            for url in links:
                if self._same_site(url) and url not in URLSTRAPPED:
                    WORKFLOW.put((url, page))
                    WIPCount += 1
class Spider(object):
    """Crawl driver: seeds the work queue from a root page and runs workers.

    Args:
        rooturl: page whose links seed the crawl.
        maxthreads: upper bound on concurrently running worker threads.
    """

    def __init__(self, rooturl, maxthreads=20):
        self.root = rooturl
        # Workers filter links against the module-level HOST, so the driver
        # uses the same value to keep both filters consistent.
        self.host = HOST
        self._MAXTHREADS = maxthreads

    def Trapping(self):
        """Crawl from ``self.root`` until no queued or in-flight work remains."""
        global WIPCount, URLS

        links = URLsParser(self.root)
        if not isinstance(links, list):
            # URLsParser signals failure with -1 / -2; iterating over that
            # integer used to raise TypeError.
            links = []
        URLS += links
        for url in links:
            try:
                host = urlparse.urlsplit(url)[1].split('.', 1)[1]
            except (IndexError, ValueError):
                host = self.host
            if host == self.host:
                WORKFLOW.put((url, self.root))
                WIPCount += 1

        workers = []
        tick = 0
        serial = 0
        while True:
            print('%d Threads in use: %d WIP: %d Trapped URLS: %d Dead_URLS: %d' % (
                tick, threading.activeCount(), WIPCount,
                len(URLSTRAPPED), len(DEADURLS)))
            tick = tick % 100 + 1
            if WIPCount < 1:
                # The queue is empty, but running workers may still add
                # links.  Wait for them and only stop if it stays empty.
                for worker in workers:
                    worker.join()
                workers = []
                if WIPCount < 1:
                    break
            elif threading.activeCount() > self._MAXTHREADS:
                sleep(3)
            else:
                serial = serial % self._MAXTHREADS + 1
                worker = ThreadSpider('threadSpider' + str(serial))
                worker.start()
                # Keep only live workers so the list stays bounded.
                workers = [w for w in workers if w.is_alive()]
                workers.append(worker)
        print('All task done.')


if __name__ == '__main__':
    # abspath: dirname(__file__) is '' when the script is started from its own
    # directory, which made the report path '/Report' and os.chdir('') fail.
    rootdir = os.path.dirname(os.path.abspath(__file__))
    if not os.path.isdir(rootdir + '/Report'):
        os.mkdir(rootdir + '/Report')

    spider = Spider(ROOT, maxthreads=20)
    spider.Trapping()

    with open(rootdir+'/Report/urls_trapped.txt', 'w') as fhandle:
        fhandle.writelines(every + '\n' for every in URLSTRAPPED)
    with open(rootdir+'/Report/urls.txt', 'w') as fhandle:
        fhandle.writelines(url + '\n' for url in URLS)
    print('~_^: Action: Statistic save finished.')
