对于需要大量翻译的数据,人工翻译太慢,此时需要使用软件进行批量翻译。
1.使用360的翻译
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
|
import json
from urllib import parse, request


def fanyi_word_cn(string):
    """Translate ``string`` with the 360 translation web endpoint.

    The text is POSTed as a URL-encoded form to
    ``https://fanyi.so.com/index/search`` and the JSON reply is decoded.

    Args:
        string: The text to translate (the original note says this is the
            English text to be translated).

    Returns:
        The value stored under ``data`` -> ``fanyi`` in the JSON reply.

    Raises:
        urllib.error.URLError: If the request fails.
        KeyError: If the reply lacks the ``data`` / ``fanyi`` keys.
    """
    url = "https://fanyi.so.com/index/search"
    # NOTE(review): 'eng' = '1' presumably selects English as the source
    # language -- confirm against the service.
    form_data = {'query': string, 'eng': '1'}
    # The server accepts only a string/bytes body, not a dict, so the form is
    # URL-encoded and then encoded to UTF-8 bytes.
    data = parse.urlencode(form_data).encode('utf-8')
    # Passing ``data`` makes this a POST; the context manager closes the
    # connection even when reading or decoding fails.
    with request.urlopen(url, data) as response:
        html = response.read().decode("utf-8")
    # The reply is a JSON object; json.loads turns it into a Python dict.
    result = json.loads(html)
    return result["data"]["fanyi"]
2.使用Google自带的API来翻译
注意:使用前只需先安装对应的API模块(代码中通过 from translate import Translator 导入,对应的PyPI包名是 translate)。
pip install translate
1
2
3
4
5
|
def translate_cn_api(content):
    """Translate ``content`` into Chinese with ``Translator(to_lang="zh")``.

    Author's note on this snippet: "google api", limited to 1000 words per
    day.

    Args:
        content: The text to translate.

    Returns:
        Whatever ``Translator.translate`` returns for ``content``.
    """
    return Translator(to_lang="zh").translate(content)
3.直接调用Google翻译的网页接口来翻译。之所以这样做,是因为上面第2种方式的API有用量限制,每天只能翻译约1000个单词。
备注:环境准备
3.1 java环境(execjs需要借助本机已安装的JavaScript运行时来执行下面HandleJs.py中的JS代码)
3.2 安装execjs模块
1
|
pip install PyExecJS |
3.3 两个实现模块
HandleJs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
|
# coding=utf-8
import execjs

# JavaScript source for the token function.  TL(a) encodes the input text as
# UTF-8 bytes, mixes them into a running value with RL() and returns a string
# of the form "<n>.<n ^ b>".
# NOTE(review): b = 406644 and b1 = 3293161072 are hard-coded seeds;
# presumably they match the service's token key at the time of writing --
# confirm before relying on them.
_TK_JS_SOURCE = """
function TL(a) {
    var k = "";
    var b = 406644;
    var b1 = 3293161072;
    var jd = ".";
    var $b = "+-a^+6";
    var Zb = "+-3^+b+-f";
    for (var e = [], f = 0, g = 0; g < a.length; g++) {
        var m = a.charCodeAt(g);
        128 > m ? e[f++] = m : (2048 > m ? e[f++] = m >> 6 | 192 : (55296 == (m & 64512) && g + 1 < a.length && 56320 == (a.charCodeAt(g + 1) & 64512) ? (m = 65536 + ((m & 1023) << 10) + (a.charCodeAt(++g) & 1023), e[f++] = m >> 18 | 240, e[f++] = m >> 12 & 63 | 128) : e[f++] = m >> 12 | 224, e[f++] = m >> 6 & 63 | 128), e[f++] = m & 63 | 128)
    }
    a = b;
    for (f = 0; f < e.length; f++) a += e[f], a = RL(a, $b);
    a = RL(a, Zb);
    a ^= b1 || 0;
    0 > a && (a = (a & 2147483647) + 2147483648);
    a %= 1E6;
    return a.toString() + jd + (a ^ b)
};
function RL(a, b) {
    var t = "a";
    var Yb = "+";
    for (var c = 0; c < b.length - 2; c += 3) {
        var d = b.charAt(c + 2), d = d >= t ? d.charCodeAt(0) - 87 : Number(d), d = b.charAt(c + 1) == Yb ? a >>> d: a << d;
        a = b.charAt(c) == Yb ? a + d & 4294967295 : a ^ d
    }
    return a
}
"""


class Py4Js(object):
    """Run the JavaScript token function ``TL`` from Python via execjs.

    Attributes:
        ctx: The execjs context compiled from the JavaScript source; it
            exposes the ``TL`` and ``RL`` functions.
    """

    def __init__(self):
        """Compile the JavaScript source into an execjs context."""
        self.ctx = execjs.compile(_TK_JS_SOURCE)

    def getTk(self, text):
        """Return the result of calling the JavaScript ``TL`` on ``text``.

        Args:
            text: The text that is going to be translated.

        Returns:
            The token string produced by ``TL``; main.py sends it as the
            ``tk`` query parameter of the translation request.
        """
        return self.ctx.call("TL", text)
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
|
# coding=utf-8
# main.py: translate English or German text into Chinese through the
# translate.google.cn web endpoint.
import contextlib
import json

try:  # Python 3
    from urllib.parse import quote
    from urllib.request import Request, urlopen
except ImportError:  # Python 2, which this script was originally written for
    from urllib2 import Request, quote, urlopen

import requests
from translate import Translator

from HandleJs import Py4Js

# Longest text (in characters) accepted by a single request.
_MAX_CONTENT_LENGTH = 4891

# Request URL; the placeholders are source language, token and quoted text.
_TRANSLATE_URL = (
    "http://translate.google.cn/translate_a/single?client=t"
    "&sl=%s&tl=zh-CN&hl=zh-CN&dt=at&dt=bd&dt=ex&dt=ld&dt=md&dt=qca"
    "&dt=rw&dt=rm&dt=ss&dt=t&ie=UTF-8&oe=UTF-8&clearbtn=1&otf=1&pc=1"
    "&srcrom=0&ssel=0&tsel=0&kc=2&tk=%s&q=%s"
)


def find_last(string, str):
    """Return the index of the last occurrence of ``str`` in ``string``.

    Example: ``find_last('aaaa', 'a')`` returns 3.

    Args:
        string: The text to search in.
        str: The substring to look for.  The name shadows the builtin but is
            kept so that existing keyword callers keep working.

    Returns:
        The highest index at which ``str`` starts, or -1 if it never occurs.
    """
    return string.rfind(str)


def open_url(url):
    """Fetch ``url`` with a browser-like User-Agent and return the body.

    Args:
        url: The address to request.

    Returns:
        The response body decoded as UTF-8.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
    }
    req = Request(url=url, headers=headers)
    # contextlib.closing works on both Python 2 and 3 responses and makes
    # sure the connection is released even if reading or decoding fails.
    with contextlib.closing(urlopen(req)) as response:
        return response.read().decode('utf-8')


def translate_core(content, tk, language):
    """Translate one piece of text into Chinese and return the result.

    Args:
        content: The text to translate; at most 4891 characters.
        tk: The request token for ``content`` (see ``Py4Js.getTk``).
        language: ``'de'`` to translate from German; any other value is
            treated as English.

    Returns:
        The translated text, or ``None`` (after printing a message) when
        ``content`` is longer than 4891 characters.

    Raises:
        ValueError: If the reply is not valid JSON.
    """
    if len(content) > _MAX_CONTENT_LENGTH:
        print("too long byte >4891")
        return None

    source_lang = 'de' if language == 'de' else 'en'
    url = _TRANSLATE_URL % (source_lang, tk, quote(content))

    # The reply is JSON, so it is parsed instead of sliced by hand; slicing
    # breaks as soon as a translation contains '",' or an escaped character.
    reply = json.loads(open_url(url))

    # NOTE(review): assumes reply[0] is a list of
    # [translated, original, ...] segments -- the previous slicing code relied
    # on the same layout.  Entries whose first field is empty are skipped;
    # presumably that is the trailing transliteration entry, which the old
    # code dropped by always ignoring the last element -- confirm.
    segments = reply[0] or []
    return ''.join(
        segment[0] for segment in segments if segment and segment[0]
    )
def translate_normal(content, language):
    """Translate ``content`` (one request's worth of text) into Chinese.

    Computes the request token with ``Py4Js`` and delegates to
    ``translate_core``.

    Args:
        content: The text to translate.
        language: Source language, passed through to ``translate_core``.

    Returns:
        Whatever ``translate_core`` returns for this text.
    """
    tk = Py4Js().getTk(content)
    return translate_core(content, tk, language)


def translate_cn(content, language):
    """Translate text of any length into Chinese, splitting it if needed.

    Text longer than 4891 characters is cut into chunks.  Each cut is made
    just after the last ``.`` inside the 4891-character window, else just
    after the last space, else exactly at the limit.  As soon as the
    remaining text fits into one request it is sent whole instead of being
    cut again.

    Args:
        content: The text to translate; it is also printed with an ``en:``
            prefix.
        language: Source language, passed through to ``translate_normal``.

    Returns:
        The translations of all chunks joined together.
    """
    LEN_LIMIT = 4891
    print('en:' + content)
    if len(content) <= LEN_LIMIT:
        return translate_normal(content, language)

    translated_parts = []
    while content:
        if len(content) <= LEN_LIMIT:
            # The rest fits into a single request: keep it in one piece.
            cut = len(content)
        else:
            window = content[:LEN_LIMIT]
            cut = find_last(window, '.') + 1
            if cut == 0:
                cut = find_last(window, ' ') + 1
            if cut == 0:
                cut = LEN_LIMIT
        translated_parts.append(translate_normal(content[:cut], language))
        content = content[cut:]
    return ''.join(translated_parts)


def translate_cn_api(content):
    """Translate ``content`` into Chinese with ``Translator(to_lang="zh")``.

    Author's note: "google api", limited to 1000 words per day.

    Args:
        content: The text to translate.

    Returns:
        Whatever ``Translator.translate`` returns for ``content``.
    """
    translator = Translator(to_lang="zh")
    return translator.translate(content)


if __name__ == "__main__":
    # German sample (run it with language = 'de'):
    # content = """ IT-Grundschutz M5.131: Absicherung von IP-Protokollen unter Windows Server 2003."""
    content = """Beautiful is better than ugly. Explicit is better than implicit. Simple is better than complex. Complex is better than complicated. Namespaces are one honking great idea -- let's do more of those!"""
    language = 'en'
    test = translate_cn(content.replace('\n', ''), language)
    print('ok:' + test)
此处实现了德语翻译成中文和英文翻译成中文。