简介
记录自己做的一些稀奇古怪的程序.
工具一: 查找单词解释
原理: http://dict.cn/ 提供了查词的引擎, 输入单词即可以得到解释. 所以这个程序只是做一些体力活. 代码如下:
(有朋友反映复制之后出错. 若如此请移步到这里下载这个脚本:
http://www.uudisc.com/user/diegoyun/file/4131948 )
#!/usr/bin/python
# -*- coding: UTF-8 -*-
'''
DictFinder.py
@version: v1.0, updated at 2011-04-28
@author: yunshichen@gmail.com
@copyright: GPL
Description:
这个程序用于辅助英语学习. 学到一些新单词新句子之后, 我喜欢打印出来读/背.
目前实现的功能:
1>到 http://dict.cn 查找单词解释
2>将结果存到html文本.
运行例子: 见 test_it 方法.
'''
from xml.dom import minidom
import urllib
import string
import os
import sys
class DictCN:
    '''
    Looks up English words on http://dict.cn and caches the raw XML
    answers on disk, so repeated queries do not hit the network.

    Results are sorted into two dicts:
      correct_word_map -- word -> DictcnWord (the site knew the word)
      wrong_word_map   -- word -> minidom document (the site returned
                          spelling suggestions instead of a definition)
    '''
    # ${word} is substituted with the word being looked up.
    DICT_CN_URL = 'http://dict.cn/ws.php?q=${word}'

    def __init__(self, cache_dir):
        '''cache_dir: directory holding cache.txt (the word->file index)
        and xml_cache/ (one raw XML answer per word).'''
        self.correct_word_map = {}
        self.wrong_word_map = {}
        self.__init_cache(cache_dir)

    def __init_cache(self, cache_dir):
        '''Create the cache layout on first run, otherwise load the index.'''
        cache_file = cache_dir + "/cache.txt"
        xml_cache_dir = cache_dir + "/xml_cache"
        self.cache_file = cache_file
        self.cache_xml_dir = xml_cache_dir
        self.cache_dir = cache_dir
        self.cache_map = {}
        if not os.path.exists(cache_file):
            # First run: create an empty index file (and its directory).
            parent, fname = os.path.split(cache_file)
            if not os.path.exists(parent):
                os.makedirs(parent)
            foo = open(cache_file, "w")
            foo.close()
            if not os.path.exists(xml_cache_dir):
                os.makedirs(xml_cache_dir)
            return
        # Robustness: recreate the XML directory if it was removed while
        # the index survived.
        if not os.path.exists(xml_cache_dir):
            os.makedirs(xml_cache_dir)
        foo = open(cache_file, "r")
        try:
            for line in foo:
                # A cache line looks like: word1:file1,word2:file2,...
                for pair in line.split(","):
                    pair = pair.strip()
                    if len(pair) < 1:
                        continue
                    word, fname = pair.split(":", 1)
                    self.cache_map[word] = fname
        finally:
            # BUG FIX: the original never closed this file handle.
            foo.close()

    def __do_query_for_word(self, en_word):
        '''Return the minidom document for en_word, from cache or the web.'''
        en_word = en_word.replace("\n", "").strip()
        xml_cache_path = self.cache_xml_dir + "/" + en_word + ".xml"
        if en_word in self.cache_map:
            foo = open(xml_cache_path, "r")
            try:
                data = foo.read()
            finally:
                foo.close()
            print("--> Found " + en_word + " at cache files")
        else:
            aurl = string.Template(self.DICT_CN_URL).substitute({'word': en_word})
            # minidom can not parse content with GBK encoding, so must convert it.
            data = urllib.urlopen(aurl).read().decode('gbk').encode('utf8')
            # minidom can not parse with xml specification, so must remove it.
            data = data.replace('<?xml version="1.0" encoding="GBK" ?>', '')
            self.update_cache(en_word, data, xml_cache_path)
            print("--> Found " + en_word + " from website and update to cache.")
        return minidom.parseString(data)

    def update_cache(self, en_word, data, xml_cache_path):
        '''Write the raw XML to disk and append the word to the index file.'''
        foo = open(xml_cache_path, "w")
        try:
            foo.write(data + "\n")
        finally:
            foo.close()
        pdir, fname = os.path.split(xml_cache_path)
        foo = open(self.cache_file, "a")
        try:
            foo.write(en_word + ":" + fname.strip() + ",")
        finally:
            foo.close()
        self.cache_map[en_word] = fname

    def do_query(self, word_strings):
        '''Query a comma-separated word list, for example: girl,name,national'''
        for ww in word_strings.split(","):
            self.handle_result(ww, None)

    def handle_result(self, ww, sen):
        '''Query one word; file it under correct_word_map or wrong_word_map.'''
        result = self.__do_query_for_word(ww)
        if self.is_word_corrent(result):
            self.correct_word_map[ww] = DictcnWord(ww, result, sen)
        else:
            self.wrong_word_map[ww] = result

    def do_query_with_sentence(self, word_strings):
        '''Query "word: sentence" pairs separated by "|",
        for example: this is an example | girl: a girl is over there | .
        '''
        for pair in word_strings.split("|"):
            if pair.strip() == "":
                continue
            pair = pair.split(":")
            ww = pair[0].strip()
            sen = None
            if len(pair) == 2:
                sen = pair[1]
            self.handle_result(ww, sen)

    def do_query_from_text_file(self, fpath):
        '''Read a note file and query every entry of its --new_word section.

        Blank lines and lines starting with "##" or "==" are skipped;
        lines after a "--new_word" marker are word|sentence entries
        (see do_query_with_sentence).
        '''
        if not os.path.exists(fpath):
            print("-->file: " + fpath + " does not exist.")
            sys.exit()
        foo = open(fpath, "r")
        pick_up_new_word = False
        new_word_list = []
        try:
            for line in foo:
                line = line.strip()
                if line == "" or line.startswith("##") or line.startswith("=="):
                    continue
                # TODO: Just support new_word now.
                if line.startswith("--new_word"):
                    pick_up_new_word = True
                    continue
                if pick_up_new_word:
                    new_word_list.append(line)
        finally:
            foo.close()
        for line in new_word_list:
            self.do_query_with_sentence(line)

    def is_word_corrent(self, result):
        '''True when dict.cn recognised the word, i.e. the answer carries no
        <sugg> spelling-suggestion elements.
        (Misspelled name "corrent" kept for backward compatibility.)'''
        return len(result.getElementsByTagName("sugg")) == 0
class DictcnWord():
    '''
    Structured view of a single dict.cn query result.

    Attributes:
      word       -- the queried word
      audio      -- pronunciation mp3 URL (from <audio>)
      pron       -- phonetic transcription (from <pron>)
      cn_explain -- Chinese definition text (from <def>)
      sentences  -- list of {"en": ..., "cn": ...} example sentences
      xml_string -- the full result re-serialised as XML

    A sample of dictcn query result:
    <dict>
    <key>national</key>
    <lang>ec</lang>
    <audio>http://mp3.dict.cn/mp3.php?q=0FBxW</audio>
    <pron>'næʃənəl</pron>
    <def>adj.民族的, 国家的, 国立的, 全国性的
    n.国民
    (复)nationals: 全国性比赛</def>
    <sent><orig>I got my hearing aid on the <em>National</em> Health (Service).</orig><trans>我的助听器是国民保健署资助的。</trans></sent>
    </dict>
    '''
    def __init__(self, word, xmldoc, sen=None):
        self.convert_from(word, xmldoc, sen)

    def convert_from(self, word, xmldoc, add_sen=None):
        '''Populate the attributes from the minidom document xmldoc.

        add_sen, when given, is appended as an extra example sentence
        with an empty translation.
        '''
        self.word = word
        self.audio = self.get_text_from_unique_element(xmldoc, "audio")
        self.pron = self.get_text_from_unique_element(xmldoc, "pron")
        self.cn_explain = self.get_text_from_unique_element(xmldoc, "def")
        alist = []
        for sent in xmldoc.getElementsByTagName("sent"):
            en = self.get_text_from_unique_element(sent, "orig")
            cn = self.get_text_from_unique_element(sent, "trans")
            alist.append({"en": en, "cn": cn})
        self.sentences = alist
        # BUG FIX: "<>" is the removed Python-2-only inequality operator;
        # replaced with the equivalent "!=" / is-not test.
        if add_sen is not None:
            self.sentences.append({"en": add_sen, "cn": ""})
        self.xml_string = xmldoc.toxml()

    def get_text_from_unique_element(self, xmldoc, tagName):
        '''Return the text content of the first <tagName> element, or "".'''
        t = xmldoc.getElementsByTagName(tagName)
        if not t:
            print("--> Tag " + tagName + " in " + self.word + " is null")
            return ""
        return self.__get_text(t[0].childNodes)

    def __get_text(self, nodelist):
        '''Concatenate the direct TEXT_NODE children of nodelist.
        NOTE: text inside nested elements (e.g. <em>) is not collected --
        this matches the original behaviour.'''
        rc = []
        for node in nodelist:
            if node.nodeType == node.TEXT_NODE:
                rc.append(node.data)
        return ''.join(rc)
class SimpleWordTemplate():
    '''
    Renders DictcnWord objects into a simple printable HTML page.
    '''
    HTML_BEGIN = '<html><head><meta http-equiv="content-type" content="text/html; charset=utf-8" /></head><body>'
    HTML_END = "</body></html>"
    # English-only entry: word, pronunciation and example sentences.
    SIMPLE_EN_TEXT = '''--------- <strong>${word}</strong> [ ${pron} ]
<ul>${sentence_str}</ul>'''
    # Chinese-only entry: definition and translated sentences.
    SIMPLE_CN_TEXT = '''------- ${cn_expl}
<ul>${sentence_str}</ul>'''
    # Combined English + Chinese entry.
    SIMPLE_ALL_TEXT = '''--------- <strong>${word}</strong> [ ${pron} ]
${cn_expl}
<ul>${sentence_str}</ul>'''

    def decorate_en_text_from(self, dict_word):
        '''Return the English-only HTML snippet for one word.'''
        # join() instead of repeated "+=" (avoids quadratic string building;
        # also: the original local was named "str", shadowing the builtin).
        items = "".join("<li>" + ss["en"] + "</li>" for ss in dict_word.sentences)
        return string.Template(SimpleWordTemplate.SIMPLE_EN_TEXT).substitute(
            {'word': dict_word.word, "pron": dict_word.pron, "sentence_str": items})

    def decorate_cn_text_from(self, dict_word):
        '''Return the Chinese-only HTML snippet for one word.'''
        items = "".join("<li>" + ss["cn"] + "</li>" for ss in dict_word.sentences)
        # BUG FIX: the original read "dict.cn_explain" -- an attribute lookup
        # on the dict *builtin* that always raised AttributeError. Use the
        # dict_word argument.
        return string.Template(SimpleWordTemplate.SIMPLE_CN_TEXT).substitute(
            {'cn_expl': dict_word.cn_explain, "sentence_str": items})

    def decorate_all_text_from(self, dict_word):
        '''Return the combined English + Chinese HTML snippet for one word.'''
        items = "".join("<li>" + ss["en"] + "</li>" for ss in dict_word.sentences)
        return string.Template(SimpleWordTemplate.SIMPLE_ALL_TEXT).substitute(
            {'word': dict_word.word, 'cn_expl': dict_word.cn_explain,
             "pron": dict_word.pron, "sentence_str": items})

    def print_en_version(self, word_map, bak_path=None):
        '''Write the English-only page for every word in word_map.'''
        en_text = ""
        for key in word_map:
            en_text = en_text + self.decorate_en_text_from(word_map[key])
        en_text = SimpleWordTemplate.HTML_BEGIN + en_text + SimpleWordTemplate.HTML_END
        # BUG FIX: bak_path was accepted but silently ignored; honour it,
        # consistent with print_both_en_cn.
        if bak_path is None:
            bak_path = "d:/download/my_word_en.html"
        self.__print_to_html__(en_text, bak_path)

    def print_both_en_cn(self, word_map, bak_path=None):
        '''Write the combined English + Chinese page for every word in word_map.'''
        en_text = ""
        for key in word_map:
            en_text = en_text + self.decorate_all_text_from(word_map[key])
        en_text = SimpleWordTemplate.HTML_BEGIN + en_text + SimpleWordTemplate.HTML_END
        if bak_path is None:
            bak_path = "d:/download/my_word_en.html"
        self.__print_to_html__(en_text, bak_path)

    def __print_to_html__(self, text, bak_path=None):
        '''Write text (UTF-8 encoded) to bak_path and report the location.'''
        # TODO: default output location should move to a config file.
        if bak_path is None:
            bak_path = "d:/download/my_word.html"
        f = open(bak_path, "w")
        try:
            f.write(text.encode("utf-8"))
        finally:
            f.close()
        print("\n--> Create file at: " + bak_path)
def test_it():
    '''Demo run: query two words with sample sentences, render the HTML page.'''
    words_query = '''
profound: a profound book | magnitude: magnitude university
'''
    finder = DictCN("/media/dev/open_source/word_cache")
    finder.do_query_with_sentence(words_query)
    report = SimpleWordTemplate()
    report.print_both_en_cn(finder.correct_word_map,
                            "/media/install/download/en_word.html")
def test_from_file():
    '''Demo run: create a sample note file (if missing) and query its words.'''
    cache_dir = "/media/dev/open_source/word_cache"
    bak_path = "/media/install/download/en_word.html"
    new_word_text = "/media/install/download/new_word.txt"
    scontent = """
################################################################
## 这个程序用于辅助英语学习. 学到一些新单词新句子之后, 我喜欢打印出来读/背.
## 略读型文章的笔记:
## new_word: 格式为 单词: 句子 | 单词: 句子 | 单词: 句子 | . 这些单词会用程序自动从网站找到解释.
################################################################
==seven habits of highly effective people
--new_word:
profound : We have transitioned from the Industrial Age into the Information Worker Age - with all of its profound consequences. |
magnitude : These challenges are not only of a new order of magnitude, they are altogether different in kind. |
rumbling : These sweeping changes in society and rumbling shifts in the digitized global marketplace give rise to a very important question |
digitize : These sweeping changes in society and rumbling shifts in the digitized global marketplace give rise to a very important question |
"""
    # Seed the sample note file only when it is not already there.
    if not os.path.exists(new_word_text):
        target_dir = os.path.dirname(new_word_text)
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        sample = open(new_word_text, "w")
        sample.write(scontent)
        sample.close()
    finder = DictCN(cache_dir)
    finder.do_query_from_text_file(new_word_text)
    report = SimpleWordTemplate()
    report.print_both_en_cn(finder.correct_word_map, bak_path)
if __name__ == "__main__":
    import time
    print("=========== Finding explanation for English new words: ")
    start = time.clock()
    test_from_file()
    print("==== Completed. Elapsed time: " + str(time.clock() - start))