1. [代码][Python]代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import os
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Root of the forum; links scraped from its pages are resolved against it.
BASE_URL = "http://hkbici.com/"

# Characters that are not safe inside a single directory name (path
# separators, Windows-reserved punctuation, control whitespace).
_UNSAFE_NAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')

# One shared session so cookies set by the login request are sent with every
# later request.
s = requests.session()


def saveImg(imageURL, fileName):
    """Download ``imageURL`` and write the response body to ``fileName``.

    The request goes through the shared session ``s`` (presumably the
    attachments need the login cookies -- confirm against the site) with a
    3 s connect / 10 s read timeout.

    Returns:
        ``True`` when the download failed and nothing was written (the
        original convention, kept for compatibility); ``None`` after the
        file has been written.
    """
    try:
        data = s.get(imageURL, timeout=(3, 10))
        # Do not store an HTTP error page under an image file name.
        data.raise_for_status()
    except requests.RequestException as err:
        print("skipped", imageURL, err)
        return True
    with open(fileName, "wb") as f:
        f.write(data.content)


def mkdir(path):
    """Create directory ``path`` (surrounding whitespace stripped).

    Returns:
        ``True`` if the directory was created, ``False`` if it already
        existed.
    """
    path = path.strip()
    try:
        os.makedirs(path)
    except FileExistsError:
        return False
    return True


def _safe_dir_name(index, title):
    """Build a per-thread directory name from its index and page keywords."""
    cleaned = _UNSAFE_NAME_CHARS.sub("_", (title or "").strip())
    return str(index) + cleaned


def _thread_links(listing_html):
    """Return the href of the first link in every ``div.c.cl`` of a listing."""
    soup = BeautifulSoup(listing_html, "html.parser")
    links = []
    for box in soup.find_all("div", class_="c cl"):
        anchor = box.find("a")
        link = anchor.get("href") if anchor else None
        if link:
            links.append(link.strip())
    return links


def _download_thread(url, index):
    """Save every attachment image of the thread at ``url`` into its own dir."""
    sp = BeautifulSoup(s.get(url).text, "html.parser")
    keywords = sp.find("meta", {"name": "keywords"})
    title = keywords.get("content") if keywords else None
    name = _safe_dir_name(index, title)
    mkdir(name)
    # Numbering counts every <ignore_js_op> element, including those without
    # an image, so file names match the element's position in the page.
    for count, m in enumerate(sp.find_all("ignore_js_op"), start=1):
        img = m.find("img")
        file_attr = img.get("file") if img else None
        if not file_attr:
            continue
        imageURL = urljoin(BASE_URL, file_attr.strip())
        fileName = os.path.join(name, str(count) + ".jpg")
        saveImg(imageURL, fileName)


# Create the directory that holds all threads and work inside it.
mkdir("img")
os.chdir("img")

# Simulated login; replace the placeholders with a real account and password.
# NOTE(review): the credentials are posted over plain HTTP.
login_data = {
    'fastloginfield': 'username',
    'username': '用户名',
    "password": "密码",
    "quickforward": "yes",
    "handlekey": "ls",
}
r = s.post(
    'http://hkbici.com/member.php?mod=logging&action=login&loginsubmit=yes&infloat=yes&lssubmit=yes',
    login_data,
)

# Crawl every thread on the first listing page; for the second page change
# the address to forum-18-2.
r = s.get('http://hkbici.com/forum-18-1.html')
href = _thread_links(r.text)

for dirCount, i in enumerate(href, start=1):
    url = urljoin(BASE_URL, i)
    print(url)
    _download_thread(url, dirCount)
没有评论:
发表评论