"""Count how often each host name appears in a set of title/URL CSV files.

Each input line is expected to look like ``"title","url"...``; the host part
of the second field is tallied in the module-level ``DICTHOSTS`` mapping and
the totals are written, most frequent first, to ``Result.csv``.
"""
import csv
import sys
import threading
from operator import itemgetter

try:
    import urlparse  # Python 2
except ImportError:
    import urllib.parse as urlparse  # Python 3: same urlsplit() API

# host name -> number of lines whose URL points at that host
DICTHOSTS = {}


class CSVReader(threading.Thread):
    """Thread that tallies the hosts found in one CSV file into DICTHOSTS.

    threadname -- name given to the thread (used in the progress message)
    filename   -- path of the CSV file to scan
    """

    def __init__(self, threadname, filename):
        threading.Thread.__init__(self, name=threadname)
        self._file = filename

    def run(self):
        """Scan the file line by line and update DICTHOSTS.

        An I/O error while opening or reading the file propagates out of the
        thread unchanged (no progress message is printed in that case).
        """
        # 'with' closes the handle on every path; if open() itself fails the
        # real I/O error surfaces instead of a NameError from a cleanup block.
        with open(self._file, 'r') as handle:
            for line in handle:
                fields = line.split('","')
                if len(fields) < 2:
                    # No URL column on this line: skip it rather than
                    # re-counting the host left over from the previous line.
                    continue

                # When the URL is the last column it still carries the closing
                # quote and the line terminator; drop them so they cannot end
                # up inside the host name.
                url = fields[1].strip().rstrip('"')
                host = urlparse.urlsplit(url)[1]
                if host:
                    DICTHOSTS[host] = DICTHOSTS.get(host, 0) + 1

        print('%s %s has been traversed. DICTHOSTS has %d items.'
              % (self.name, self._file, len(DICTHOSTS)))


def _open_csv_output(filename):
    """Open *filename* for writing in the mode the csv module requires."""
    if sys.version_info[0] >= 3:
        # Python 3: text mode, newline translation left to the csv writer.
        return open(filename, 'w', newline='')
    # Python 2: the csv module writes bytes.
    return open(filename, 'wb')


if __name__ == '__main__':
    path = 'E:/workspace/URLsSorter/src/titleurl/'
    for i in range(1, 21):
        si = str(i)
        reader = CSVReader('Thread' + si, path + 'titleurl' + si + '.csv')
        reader.start()
        # Joining right away processes the files one at a time; this keeps
        # the unsynchronised updates of DICTHOSTS from interleaving.
        reader.join()

    sortedList = sorted(DICTHOSTS.items(), key=itemgetter(1), reverse=True)

    # Close the result file explicitly so every row is flushed to disk.
    with _open_csv_output(path + 'Result.csv') as result_file:
        csv.writer(result_file).writerows(sortedList)
|
|
|
| 日 | 一 | 二 | 三 | 四 | 五 | 六 |
---|
30 | 31 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
|
导航
统计
- 随笔: 42
- 文章: 0
- 评论: 12
- 引用: 0
常用链接
留言簿(1)
随笔分类(32)
随笔档案(42)
文章分类
最新随笔
最新评论
data:image/s3,"s3://crabby-images/1813e/1813e4ba15fd5a936e0ad0a3a2d332b01f5e40c0" alt=""
阅读排行榜
评论排行榜
|
|