import requests
from bs4 import BeautifulSoup
import re
import traceback
import sqlite3
import time
import sys
def ycl(word):
    """Scrape the iciba.com page for ``word``, save its audio and store its senses.

    The function downloads the dictionary page for ``word``, saves the MP3
    referenced by the page's second ``new-speak-step`` element into the
    hard-coded sound directory, and inserts one row per part-of-speech entry
    into the ``test`` table of ``word.db`` (which must already exist).

    Any exception raised along the way (network error, unexpected page
    layout, file or database failure) is caught: the exception and the text
    ``error`` are printed and ``word`` is appended to ``log.txt``.

    Args:
        word: The word to look up. It is interpolated into the request URL
            and used as the MP3 file name.

    Returns:
        None.
    """
    # Without a timeout requests waits forever on a stalled connection.
    timeout = 30
    try:
        url = "http://www.iciba.com/{}".format(word)
        headers = {
            'Host': 'www.iciba.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'Referer': 'http://www.baidu.com',
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
        }
        response = requests.get(url=url, headers=headers, timeout=timeout)
        soup = BeautifulSoup(response.text, "lxml")

        # Block that lists the senses; calling a tag is shorthand for
        # find_all().  If the block is missing this raises and the word is
        # logged by the handler below.
        base_list = soup.find(class_='base-list switch_part')
        # Part-of-speech tags of the word.
        cx = base_list(class_='prop')

        # The audio URL is embedded in the mouse-over handler attribute of
        # the second speaker element.
        mp3 = soup.find_all(class_='new-speak-step')[1]
        pattern = re.compile(r'http://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+.mp3')
        mp3url = re.findall(pattern, mp3['ms-on-mouseover'])
        mp3url = '.'.join(mp3url)
        r = requests.get(mp3url, timeout=timeout)

        # Output path of the word's audio file.
        dress = "E:\\sound\\"
        mp3path = dress + word + ".mp3"
        with open(mp3path, 'wb') as f:
            f.write(r.content)

        # Number of part-of-speech entries on the page.
        meanings = soup.find_all(class_='prop')
        # One <li> per part of speech; its <span> children hold the senses.
        entries = base_list('li')

        # One connection per word, always closed, instead of a leaked
        # connection per inserted row.
        conn = sqlite3.connect("word.db")
        try:
            cu = conn.cursor()
            for i in range(len(meanings)):
                s = entries[i]('span')
                c = cx[i].text
                # Put all senses of this part of speech on a single line.
                a = ''.join(span.text for span in s)
                print(word)
                print(c)
                print(a)
                # Bound parameters: scraped text may contain quotes, which
                # would break (or inject into) a string-formatted statement.
                sql = cu.execute(
                    "INSERT INTO test (id,dc,cx,cy,mp3) VALUES (NULL,?,?,?,?);",
                    (word, c, a, mp3path),
                )
                print(sql)
                # Commit per row so rows stored before a later failure persist.
                conn.commit()
                print('\n')
        finally:
            conn.close()
    except Exception as e:
        # Best-effort crawl: report the problem and record the failed word.
        print(e)
        print("error")
        with open("log.txt", 'a') as f:
            f.write(word + '\n')
def duqudanci(file):
    """Run ``ycl`` on every line of a word-list file.

    Each line has its newline characters stripped and is passed to ``ycl``;
    the 1-based line counter is printed before each call.

    Args:
        file: Path of a text file containing one word per line.

    Returns:
        None.
    """
    # The with-block guarantees the word list is closed even if ycl() raises.
    with open(file) as f:
        for wordcount, line in enumerate(f, 1):
            word = line.strip('\n')
            print(wordcount)
            ycl(word)
if __name__ == '__main__':
    # The path of the word list to crawl is the only command-line argument;
    # fail with a usage message instead of a bare IndexError traceback.
    if len(sys.argv) < 2:
        sys.exit("usage: {} WORD_LIST_FILE".format(sys.argv[0]))
    conn = sqlite3.connect("word.db")
    try:
        # Words to crawl.
        duqudanci(sys.argv[1])
        print('下载完成')
        conn.commit()
    finally:
        # Close the database connection even if the crawl is interrupted.
        conn.close()
|