利用 Python 程式協助查英文單字

查單字程式碼:

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
import urllib.request
from bs4 import BeautifulSoup
import sys, codecs
# 將系統輸出語系編碼設為 utf8
sys.stdout = codecs.getwriter("utf8")(sys.stdout.detach())
# 表示要讀入的文章檔名為 wed.txt
filename = "wed"

"""
 Project: Concordancer Jr.
 File name: concordance.py
 Description:  Counts up the number of each unique word in a block of plain text.

 Copyright (C) 2010 Steve Osborne, srosborne (at) gmail.com
 http://yakinikuman.wordpress.com/
 *******
 This program is free software; you can redistribute it and/or modify 
 it under the terms of the GNU General Public License as published by 
 the Free Software Foundation; either version 2 of the License, or 
 (at your option) any later version.

 This program is distributed in the hope that it will be useful, but 
 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 
 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 
 for more details.

 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.

 *******
 Version history:
 1.0   Oct 27 2010
 2.0      Nov 11, 2010 - put into a class.  Can now be called with any block of text.

"""

class Concordancer:
    def __init__(self):
        #from http://www.duboislc.org/EducationWatch/First100Words.html
        #some common words a little fishy... "water"? "oil"??  But no "am"???
        self.common = ['the','of','and','a','to','in','is','you','that','it',
            'he','was','for','on','are','as','with','his','they','i',
            'at','be','this','have','from','or','one','had','by','word',
            'but','not','what','all','were','we','when','your','can','said',
            'there','use','an','each','which','she','do','how','their','if',
            'will','up','other','about','out','many','then','them','these','so',
            'some','her','would','make','like','him','into','time','has','look',
            'two','more','write','go','see','number','no','way','could','people',
            'my','than','first','water','been','call','who','oil','its','now',
            'find','long','down','day','did','get','come','made','may','part']

        self.wordIndex = dict() #will be a count of each word in the input text
        self.total = 0 #total words
        self.unique = 0 #unique words

    def getCommon(self):
        return self.common

    def updateCommon(self,newCommon):
        #newCommon is a list of words to be used on future calls to topWordsNotCommon
        self.common = newCommon

    def extendCommon(self,newCommon):
        #newCommon is a list of words to be added to self.common
        self.common.extend(newCommon)

    def populateIndex(self,data):
        #data is a block of text
        #splits up data and adds each word to the index
        #repeated calls to populateIndex will NOT clear the index - will just keep adding up words in new block of text
        for word in data.split():#splits at and removes whitespace
            self.addword(word)
        self.calculateUniqueWords()
        self.calculateTotalWords()

    def addword(self,word):
        word = word.translate("0123456789.!?,;:*\)\(\[\]\\\n/'\"")#remove punctuation, numbers, and newlines

        if len(word) > 5 and word.isalpha():
            word = word.lower()#convert to lower case
            #special case of dashes "--": separate into two words
            if "--" in word:
                words = word.replace('--',' ')
                for w in words.split():
                    self.addword(w)
            elif word in self.wordIndex:
                self.wordIndex[word] = self.wordIndex[word] + 1
            else:
                self.wordIndex[word] = 1

    def getSortedIndex(self):
        #note - sorted returns a list of tuples, not a dictionary
        s1 = sorted(list(self.wordIndex.items()),key=lambda item:item[0]) #secondary key: sort alphabetically
        s2 = sorted(s1,key=lambda item:item[1], reverse=True) #primary key: sort by count
        return s2

    def calculateUniqueWords(self):
        self.unique = len(self.wordIndex)

    def calculateTotalWords(self):
        total = 0
        for word in list(self.wordIndex.keys()):
            total = total + self.wordIndex[word]
        self.total = total

    def topWords(self,n,fExcludeCommon=1):
        #run only after "populateIndex" for meaningful output
        #returns list of top min(n,unique) words in the index
        #fExcludeCommon: if 1 [default], excludes any words in self.common.  Set to 0 to include all words.
        #returns list of (word,count) pairs for the top n words.  'count' is the count of that word.
        s2 = self.getSortedIndex()
        lwords = []
        i = 0
        while len(lwords) < n and i < self.unique:
            key = s2[i][0]
            value = s2[i][1]
            item = (key,value)
            if fExcludeCommon:
                if key not in self.common:
                    lwords.append(item)
            else:
                lwords.append(item)
            i = i + 1
        return lwords

# 以上為統計文章中單字出現次數的類別

def chk_dict(單字):
    url = 'http://dictionary.sina.com.tw/word/ec/'
    response = urllib.request.urlopen(url+單字)
    text = response.read()
    try:
        html_doc = text.decode("utf-8")
    except:
        html_doc = text.decode("latin-1")
    return html_doc

def parse_doc(網頁超文件):
    all_text = ""
    soup = BeautifulSoup(網頁超文件)
    div_tag = soup.findAll('div',{'class',"word_text1"})
    for i in range(len(div_tag)):
        all_text += div_tag[i].get_text()
    return all_text+"\n"

# 用來統計單字次序的全域變數
word_count = 0

def 查單字(單字):
    global word_count
    html_doc = chk_dict(單字)
    our_text = parse_doc(html_doc)
    if our_text == "\n":
        word_def = "查不到與 "+單字+" 有關的資料\n"
    else:
        word_count += 1
        word_def = str(word_count)+". "+單字+":\n"
        word_def += our_text
    word_def += "_"*50+"\n"
    return word_def

# 檔案是要將結果存檔用的 handle
檔案 = open("words_"+filename+".txt", "w", encoding="UTF-8")

# 以下為統計文章單字用的程式呼叫
concord = Concordancer()
file = filename+'.txt'
f = open(file,'rt',encoding="utf-8")
data = f.read()#the whole file as one big string
concord.populateIndex(data)

n = concord.unique
print("Top %s words:" % n)
top = concord.topWords(n)

# 隨後的 key 就是單字
order = 0
all_text = ""
for item in top:
    order += 1
    key = item[0]
    value = item[1]
    #print(order,"%s:%s" % (key,value))
    print(order,key,value)
    all_text += 查單字(key)

檔案.write(all_text)
print("done")

Comments

comments powered by Disqus