3
|
1 |
import json |
|
2 |
import os |
|
3 |
import sys, getopt |
|
4 |
import demjson |
|
5 |
from bs4 import BeautifulSoup |
|
6 |
import urllib.request |
|
7 |
import requests |
|
8 |
|
|
9 |
def main(argv):
    """Scrape image metadata and image files from an ESO image-list page.

    Command-line options:
        -h                 print usage and exit
        -d, --dir DIR      output directory (default "./scrapESO/")
        -u, --url URL      image-list base URL; "list/" is appended
                           (default "https://www.eso.org/public/france/images/viewall/list/")
        -o, --original     prefer original/publication TIFF downloads over JPEGs

    Walks numbered list pages (…/list/1, …/list/2, …) until a page returns an
    HTTP error, extracts the embedded "var images" JavaScript array from each
    page, then visits every image page to collect metadata and download the
    first available image variant. Writes one <imgid>/<imgid>.json + image
    file per entry, plus a combined allfiles.json in the output directory.
    """
    directory = './scrapESO/'
    imgloturl = 'https://www.eso.org/public/france/images/viewall/list/'
    # Candidate (download-url template, local-filename template) pairs,
    # tried in order; the first URL answering HTTP 200 wins.
    # BUG FIX: the original list was missing the comma between the two
    # tuples, which made Python *call* the first tuple with the second as
    # arguments -> TypeError at runtime.
    img_url_patterns = [
        ('https://cdn.eso.org/images/publicationjpg/{imgid}.jpg', '{imgid}.jpg'),
        ('https://cdn.eso.org/images/large/{imgid}.jpg', '{imgid}.jpg'),
    ]
    try:
        # BUG FIX: "-o"/"--original" is handled below but was never declared
        # here ("hd:u:" / ["dir=","url="]), so using it raised GetoptError.
        opts, args = getopt.getopt(argv, "hod:u:", ["dir=", "url=", "original"])
    except getopt.GetoptError:
        print('test.py -d <directory> -u <imagegroupurl>')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('''test.py -d <directory> (default is : "./scrapESO/")
-u <imagegroupurl> (default is : "https://www.eso.org/public/france/images/viewall/list/"''')
            sys.exit()
        elif opt in ("-d", "--dir"):
            directory = arg
        elif opt in ("-u", "--url"):
            imgloturl = arg + "list/"
        elif opt in ("-o", "--original"):
            # Prefer lossless TIFF variants; fall back to the large JPEG.
            img_url_patterns = [
                ('https://www.eso.org/public/archives/images/original/{imgid}.tif', '{imgid}.tif'),
                ('https://www.eso.org/public/archives/images/publicationtiff/{imgid}.tif', '{imgid}.tif'),
                ('https://cdn.eso.org/images/large/{imgid}.jpg', '{imgid}.jpg')]

    if not os.path.exists(directory):
        os.mkdir(directory)
    jsonfile = directory + "allfiles.json"
    # Context manager so the combined JSON file is closed even on errors
    # (the original leaked the handle if anything below raised).
    with open(jsonfile, "w") as json_all:
        i = 1
        eso = imgloturl + str(i)
        # Page numbers run until the server answers with an error status.
        while requests.head(eso).status_code < 400:
            page = requests.get(eso)
            soup = BeautifulSoup(page.text, "html5lib")
            # Locate the inline <script> that defines the "var images" array.
            scripttext = None
            for scripts in soup.find_all('script'):
                if "var images" in scripts.text:
                    scripttext = scripts.text
                    break
            if scripttext is None:
                # BUG FIX: scripttext was referenced unconditionally and
                # raised NameError when no matching script existed.
                print("No 'var images' script found on " + eso)
                break
            # Find the opening '[' of the JS array literal.
            start = 0
            for scriptchar in scripttext:
                if scriptchar == "[":
                    break
                start += 1
            # Strip the trailing ";\n" left after the array literal.
            listjs = scripttext[start:-2]
            json_all.write(listjs + ",")
            # demjson tolerates the non-strict JS object syntax (unquoted
            # keys) that strict json.loads would reject.
            listdem = demjson.decode(listjs)
            for j in listdem:
                infosdict = {'image': {}, 'object': {}}
                realurl = "https://www.eso.org" + j['url']
                page = requests.get(realurl)
                soup = BeautifulSoup(page.text, "html5lib")
                # First table: image metadata; second table: object metadata.
                tables = soup.find("div", class_="object-info").find_all("table")
                infosimg = tables[0]
                infosobj = tables[1]
                infosdict['url'] = realurl
                infosdict['image']['title'] = soup.find("h1").text
                infosdict['image']['credit'] = soup.find("div", class_="credit").text
                imagesoup = soup.find("div", class_="archive-image archive-image-dark popup")
                if not imagesoup:
                    print("Image not found for " + realurl)
                    continue
                print(realurl)
                # Collect description paragraphs until the credit section.
                descriptionps = imagesoup.find_all_next("p")
                descriptiontext = ""
                descriptionhtml = ""
                for descriptionp in descriptionps:
                    if "credit" in str(descriptionp.previous_element):
                        break
                    descriptiontext_p = descriptionp.text.strip()
                    if descriptiontext_p:
                        descriptiontext += descriptiontext_p + "\n"

                    descriptionhtml_p = str(descriptionp).strip()
                    if descriptionhtml_p != "<p></p>":
                        descriptionhtml += descriptionhtml_p
                infosdict['image']['description_text'] = descriptiontext
                infosdict['image']['description_html'] = descriptionhtml
                # BUG FIX: imgid was unbound (NameError) when no "Id" row
                # existed in the metadata table; initialise and guard below.
                imgid = None
                for tableimg in infosimg.find_all("tr"):
                    if "Id" in tableimg.text:
                        imgid = tableimg.find_all("td")[1].text
                        infosdict['image']['id'] = imgid
                    if "Type" in tableimg.text:
                        infosdict['image']['type'] = tableimg.find_all("td")[1].text
                    if "Date" in tableimg.text or "date" in tableimg.text:
                        infosdict['image']['date_publication'] = tableimg.find_all("td")[1].text
                for tableobj in infosobj.find_all("tr"):
                    # "Nom" covers the French-localised pages.
                    if "Name" in tableobj.text or "Nom" in tableobj.text:
                        infosdict['object']['name'] = tableobj.find_all("td")[1].text
                    if "Type" in tableobj.text:
                        infosdict['object']['type'] = tableobj.find_all("td")[1].text
                if imgid is None:
                    print("Image id not found for " + realurl)
                    continue
                # Probe each candidate URL; keep the first that exists.
                imgurl = None
                img = None
                for imgurl_pattern, img_pattern in img_url_patterns:
                    imgurl_test = imgurl_pattern.format(imgid=imgid)
                    if requests.head(imgurl_test).status_code == 200:
                        imgurl = imgurl_test
                        img = img_pattern.format(imgid=imgid)
                        break
                if imgurl is None:
                    continue

                infosdict['image']['imgurl'] = imgurl
                imgdirectory = directory + imgid
                # BUG FIX: unconditional mkdir raised FileExistsError when
                # re-running the scraper over an existing output directory.
                if not os.path.exists(imgdirectory):
                    os.mkdir(imgdirectory)
                imgdirection = imgdirectory + "/" + img
                urllib.request.urlretrieve(imgurl, imgdirection)
                jsondirection = imgdirectory + "/" + imgid + ".json"
                with open(jsondirection, "w") as json_img_file:
                    json_img_file.write(json.dumps(infosdict, sort_keys=True,
                                                   indent=4, separators=(',', ': ')))
                print(realurl + " : OK")
            i += 1
            eso = imgloturl + str(i)
138 |
|
|
139 |
|
|
140 |
if __name__ == '__main__':
    # Drop argv[0] (the script name); main() parses the remaining CLI flags.
    main(sys.argv[1:])