-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscrape.py
More file actions
51 lines (41 loc) · 1.41 KB
/
scrape.py
File metadata and controls
51 lines (41 loc) · 1.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import requests
from bs4 import BeautifulSoup
import urllib.request
def makesoup(url, timeout=30):
    """Download *url* and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection, which would hang the whole scrape.

    Returns:
        A ``BeautifulSoup`` object built with the ``html5lib`` parser.

    Raises:
        requests.exceptions.RequestException: If the request fails or the
            timeout elapses.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, 'html5lib')
def dl_img(url, file_name):
    """Save the resource at *url* to ``<file_name>.png``.

    The ``.png`` suffix is always appended to *file_name*; the path is
    resolved relative to the current working directory unless *file_name*
    is already absolute.
    """
    urllib.request.urlretrieve(url, file_name + '.png')
def iter_months(start_month, start_year, end_month, end_year):
    """Yield ``(year, month)`` pairs from the start month to the end month.

    Both endpoints are inclusive. After December the month wraps back to 1
    and the year advances. Nothing is yielded when the start lies after the
    end.

    Raises:
        ValueError: If either month is outside 1..12.
    """
    for value in (start_month, end_month):
        if not 1 <= value <= 12:
            raise ValueError("month must be in 1..12, got {}".format(value))
    year, month = start_year, start_month
    # Tuple comparison orders by year first, then by month.
    while (year, month) <= (end_year, end_month):
        yield year, month
        if month == 12:
            year, month = year + 1, 1
        else:
            month += 1


def main():
    """Download every comic of one author published within a month range.

    The user is prompted for a text file holding five whitespace-separated
    fields: ``start_month start_year end_month end_year author``.
    Images are saved as ``1.png``, ``2.png``, ... in the current working
    directory.
    """
    # designed to take only one author as input
    filename = input("Enter input text file : ")
    with open(filename, "r") as input_file:
        fields = input_file.read().split()
    print(fields)
    if len(fields) < 5:
        raise SystemExit(
            "input file must contain: start_month start_year "
            "end_month end_year author"
        )

    start_month, start_year, end_month, end_year = (
        int(field) for field in fields[:4]
    )
    author = fields[4]

    counter = 1
    for year, month in iter_months(start_month, start_year,
                                   end_month, end_year):
        url = "http://explosm.net/comics/archive/{}/{}/{}".format(
            year, month, author)
        soup = makesoup(url)
        table = soup.find(
            'div', attrs={'class': 'small-7 medium-8 large-8 columns'})
        if table is None:
            # The archive page has no comic listing for this month.
            continue
        for row in table.findAll(
                'div', attrs={'class': 'small-3 medium-3 large-3 columns'}):
            new_url = "http://explosm.net" + row.a['href']
            new_soup = makesoup(new_url)
            img = new_soup.find('img', attrs={'id': 'main-comic'})
            src = img.get('src') if img is not None else None
            if not src:
                # Comic page without the expected image; skip it.
                continue
            # The original code prepends the scheme, which assumes the page
            # gives a protocol-relative src ("//host/path").
            img_src = "http:" + src
            # images are saved as numbers (1, 2, ...)
            dl_img(img_src, str(counter))
            counter += 1


if __name__ == "__main__":
    main()