生成CSV用于测试:
# -*- coding: utf-8 -*-
# Test-data generator: builds batches of random (title, url) rows and appends
# each batch to a CSV file from a worker thread.
import csv
import io
import random
import sys
import threading

URLS = ['http://www.163.com/aaaa/bbb/ccc.jsp&path=1234',
        'http://wap.roboo.com/pages/c.jsp',
        'http://www.sina.com.cn/sjf/sfsfsdf/sfsfsafsda.aspx&sdf=sfsdf&sdfsdf=%&%^*',
        'http://sohu.cn/sjflsjlfjsdf/safsfd/safasfsaf/sf.jsp',
        'http://tianya.cn/sfsf/sf/sf/sf/s/f/sfs.aaa',
        'http://download.org.cn/s/fs/f/s/f/a.af',
        'http://www.roboo.com/sring.jsp/sjalfj.jsp&sdlfajlsf=1493932',
        'http://houhouhou/alsjdf/sa/f/sa/f.sf',
        'http://asfsf.cn/safs.asfdd',
        'http://哦哦/saf.aspx&234..23j4l2j34/safd.aspx',
        'http://www.263.net/sfsfds/sfsdf.jsp&12323&2123234&23424=23423'
        ]

TITLES = ['爱莎莎莎',
          '什么和生命',
          '生活的意义',
          '新浪蓝景',
          '如薄啊,无敌铁',
          '神龙见附录三顿饭',
          '我就哦哦将为您',
          '宁波ihoiha',
          '阿加莎刻录机服了你',
          '宁波啊遏体哦你'
          ]

MAX_TREADS = 10
# Number of TCSVGenerater instances created whose run() has not finished yet.
# Guarded by ``lock``. Initialised at import time so the class can also be
# used when this module is imported rather than run as a script.
currentThreads = 0
lock = threading.RLock()


def _open_csv(filename, mode):
    """Open *filename* in the form the running interpreter's csv module needs.

    Python 2's csv module works on byte streams, so the file is opened in
    binary mode. Python 3's works on text opened with ``newline=''``; UTF-8
    is used there because this source file is UTF-8 encoded, which is what
    Python 2 writes for the string literals above.
    """
    if sys.version_info[0] < 3:
        return open(filename, mode + 'b')
    return io.open(filename, mode, newline='', encoding='utf-8')


def _random_rows(count):
    """Return a new list of *count* random ``(title, url)`` tuples."""
    # random.choice follows the real list lengths; the previous fixed
    # randrange(10) could never select the 11th entry of URLS.
    return [(random.choice(TITLES), random.choice(URLS))
            for _ in range(count)]


class TCSVGenerater(threading.Thread):
    """Worker thread that writes rows to a CSV file.

    Args:
        data: iterable of row sequences handed to ``csv.writer.writerows``.
        filename: path of the CSV file to write.
        mode: file mode without the binary flag; ``'a'`` (append) by default.

    Creating an instance increments the module-level ``currentThreads``
    counter; the counter is decremented again when ``run`` ends.
    """

    def __init__(self, data, filename, mode='a'):
        global currentThreads
        threading.Thread.__init__(self)
        self._data = data
        self._filename = filename
        self._mode = mode
        with lock:
            currentThreads += 1

    def run(self):
        """Write all rows to the file, then give the thread slot back."""
        global currentThreads
        try:
            # The with-block closes the file so the rows are flushed to disk
            # before the counter reports this worker as finished.
            with _open_csv(self._filename, self._mode) as handle:
                csv.writer(handle).writerows(self._data)
        finally:
            # Decrement even when the write fails, otherwise the counter
            # would stay inflated for the rest of the process.
            with lock:
                currentThreads -= 1


if __name__ == '__main__':
    filename = 'd:/CSV/titleurl1.csv'

    for i in range(20):
        # Every worker is joined before the next one is created, so the
        # counter is back at zero here; this guard only takes effect if the
        # join below is removed.
        if currentThreads > MAX_TREADS:
            continue

        # A fresh list per batch: reusing one growing list made every batch
        # write all rows of the earlier batches again.
        thread = TCSVGenerater(_random_rows(10000), filename)
        thread.start()
        thread.join()
        print(' '.join(['第', str(i), '份随机10000数据已写入。']))
统计的代码:
# Host statistics: reads a title/url CSV file, counts how many rows point at
# each host, and writes the hosts sorted by count (highest first) to a CSV.
import csv
import io
import sys
import threading
from operator import itemgetter

try:
    import urlparse  # Python 2 module name
except ImportError:
    import urllib.parse as urlparse  # Python 3 location of urlsplit

# Maps host -> number of rows whose URL has that host. Filled by
# TXTReader.run and read by the script section at the bottom.
DICTHOSTS = {}


def _open_csv(filename, mode):
    """Open *filename* in the form the running interpreter's csv module needs.

    Python 2's csv module works on byte streams, so the file is opened in
    binary mode. Python 3's works on text opened with ``newline=''``.
    NOTE(review): UTF-8 is assumed for the text on Python 3 -- confirm it
    matches whatever produced the input file.
    """
    if sys.version_info[0] < 3:
        return open(filename, mode + 'b')
    return io.open(filename, mode, newline='', encoding='utf-8')


class TXTReader(threading.Thread):
    """Thread that tallies the URL hosts of a CSV file into ``DICTHOSTS``.

    Args:
        filename: path of a CSV file whose second column holds a URL.
    """

    def __init__(self, filename):
        threading.Thread.__init__(self)
        self._file = filename

    def run(self):
        """Read the file row by row and count each non-empty host."""
        with _open_csv(self._file, 'r') as handle:
            # csv.reader instead of line.split(','): a quoted first column
            # may itself contain commas, which shifted the URL out of
            # position 1 and made such rows vanish from the count.
            for row in csv.reader(handle):
                if len(row) < 2:
                    # Blank or truncated row: there is no URL column.
                    continue
                host = urlparse.urlsplit(row[1])[1]
                if host:
                    DICTHOSTS[host] = DICTHOSTS.get(host, 0) + 1


if __name__ == '__main__':
    reader = TXTReader('d:/CSV/titleurl1.csv')
    reader.start()
    reader.join()
    print(DICTHOSTS)

    sortedList = sorted(DICTHOSTS.items(), key=itemgetter(1), reverse=True)

    # The with-block closes the result file so every row reaches the disk.
    with _open_csv('d:/CSV/Result.csv', 'w') as handle:
        csv.writer(handle).writerows(sortedList)
|
|
|
| 日 | 一 | 二 | 三 | 四 | 五 | 六 |
---|
25 | 26 | 27 | 28 | 29 | 30 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 1 | 2 | 3 | 4 | 5 |
|
导航
统计
- 随笔: 42
- 文章: 0
- 评论: 12
- 引用: 0
常用链接
留言簿(1)
随笔分类(32)
随笔档案(42)
文章分类
最新随笔
最新评论
阅读排行榜
评论排行榜
|
|