|
1 import json |
|
2 import os |
|
3 import sys, getopt |
|
4 import demjson |
|
5 from bs4 import BeautifulSoup |
|
6 import urllib.request |
|
7 import requests |
|
8 |
|
def _parse_options(argv):
    """Parse command-line options into ``(directory, imgloturl, img_url_patterns)``."""
    directory = './scrapESO/'
    imgloturl = 'https://www.eso.org/public/france/images/viewall/list/'
    # (download URL template, local file name template) pairs, probed in
    # order; the first URL answering HTTP 200 is the one downloaded.
    img_url_patterns = [
        ('https://cdn.eso.org/images/publicationjpg/{imgid}.jpg', '{imgid}.jpg'),
        ('https://cdn.eso.org/images/large/{imgid}.jpg', '{imgid}.jpg'),
    ]
    try:
        # "o"/"original" must be declared here, otherwise getopt rejects the
        # flag before the loop below ever sees it.
        opts, _ = getopt.getopt(
            argv, "hd:u:o", ["help", "dir=", "url=", "original"])
    except getopt.GetoptError:
        print('test.py -d <directory> -u <imagegroupurl>')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print('''test.py -d <directory> (default is : "./scrapESO/")
        -u <imagegroupurl> (default is : "https://www.eso.org/public/france/images/viewall/list/")
        -o (download the original TIFF when available)''')
            sys.exit()
        elif opt in ("-d", "--dir"):
            directory = arg
        elif opt in ("-u", "--url"):
            # Accept the group URL with or without its trailing slash.
            imgloturl = arg.rstrip("/") + "/list/"
        elif opt in ("-o", "--original"):
            img_url_patterns = [
                ('https://www.eso.org/public/archives/images/original/{imgid}.tif', '{imgid}.tif'),
                ('https://www.eso.org/public/archives/images/publicationtiff/{imgid}.tif', '{imgid}.tif'),
                ('https://cdn.eso.org/images/large/{imgid}.jpg', '{imgid}.jpg'),
            ]
    return directory, imgloturl, img_url_patterns


def _extract_image_list(page_html):
    """Return the array literal from the first ``var images`` script, or None."""
    soup = BeautifulSoup(page_html, "html5lib")
    for script in soup.find_all('script'):
        scripttext = script.text
        if "var images" not in scripttext:
            continue
        start = scripttext.find("[")
        if start == -1:
            return None
        # Keep everything from the first "[" and drop the last two characters
        # (presumably the trailing ";" and newline -- TODO confirm on a live page).
        return scripttext[start:-2]
    return None


def _scrape_image(entry, directory, img_url_patterns):
    """Download the image described by *entry* and write its metadata JSON beside it."""
    realurl = "https://www.eso.org" + entry['url']
    page = requests.get(realurl)
    soup = BeautifulSoup(page.text, "html5lib")

    info_div = soup.find("div", class_="object-info")
    tables = info_div.find_all("table") if info_div is not None else []
    if not tables:
        print("Image information not found for " + realurl)
        return
    infosimg = tables[0]
    infosobj = tables[1] if len(tables) > 1 else None

    infosdict = {'image': {}, 'object': {}, 'url': realurl}
    infosdict['image']['title'] = soup.find("h1").text
    infosdict['image']['credit'] = soup.find("div", class_="credit").text

    imagesoup = soup.find("div", class_="archive-image archive-image-dark popup")
    if imagesoup is None:
        print("Image not found for " + realurl)
        return
    print(realurl)

    # The description is every <p> following the image block, up to the
    # first paragraph whose preceding element mentions "credit".
    text_parts = []
    html_parts = []
    for descriptionp in imagesoup.find_all_next("p"):
        if "credit" in str(descriptionp.previous_element):
            break
        paragraph_text = descriptionp.text.strip()
        if paragraph_text:
            text_parts.append(paragraph_text + "\n")
        paragraph_html = str(descriptionp).strip()
        if paragraph_html != "<p></p>":
            html_parts.append(paragraph_html)
    infosdict['image']['description_text'] = "".join(text_parts)
    infosdict['image']['description_html'] = "".join(html_parts)

    # Reset per image so an id from a previous page can never leak in.
    imgid = None
    for tableimg in infosimg.find_all("tr"):
        row_text = tableimg.text
        if "Id" in row_text:
            imgid = tableimg.find_all("td")[1].text
            infosdict['image']['id'] = imgid
        if "Type" in row_text:
            infosdict['image']['type'] = tableimg.find_all("td")[1].text
        if "Date" in row_text or "date" in row_text:
            infosdict['image']['date_publication'] = tableimg.find_all("td")[1].text
    if imgid is None:
        print("Image id not found for " + realurl)
        return

    if infosobj is not None:
        for tableobj in infosobj.find_all("tr"):
            row_text = tableobj.text
            if "Name" in row_text or "Nom" in row_text:
                infosdict['object']['name'] = tableobj.find_all("td")[1].text
            if "Type" in row_text:
                infosdict['object']['type'] = tableobj.find_all("td")[1].text

    imgurl = None
    img = None
    for imgurl_pattern, img_pattern in img_url_patterns:
        imgurl_test = imgurl_pattern.format(imgid=imgid)
        if requests.head(imgurl_test).status_code == 200:
            imgurl = imgurl_test
            img = img_pattern.format(imgid=imgid)
            break
    if imgurl is None:
        print("No downloadable image found for " + realurl)
        return

    infosdict['image']['imgurl'] = imgurl
    imgdirectory = os.path.join(directory, imgid)
    # exist_ok lets the scraper be re-run over a partially filled directory.
    os.makedirs(imgdirectory, exist_ok=True)
    urllib.request.urlretrieve(imgurl, os.path.join(imgdirectory, img))
    jsondirection = os.path.join(imgdirectory, imgid + ".json")
    with open(jsondirection, "w", encoding="utf-8") as json_img_file:
        json_img_file.write(
            json.dumps(infosdict, sort_keys=True, indent=4, separators=(',', ': ')))
    print(realurl + " : OK")


def main(argv):
    """Scrape an ESO image listing, downloading each image with its metadata.

    Listing pages ``<url>list/1``, ``<url>list/2``, ... are fetched until a
    HEAD request returns a status of 400 or above, or a page carries no
    ``var images`` script. The raw image list of every page is appended to
    ``<directory>/allfiles.json``; each image is stored in
    ``<directory>/<image id>/`` together with ``<image id>.json``.

    Args:
        argv: Command-line arguments without the program name. Options:
            ``-d/--dir <directory>``  output directory
            (default ``./scrapESO/``);
            ``-u/--url <imagegroupurl>``  image group URL, for example
            https://www.eso.org/public/france/images/archive/category/alma/
            (default: the "viewall" listing);
            ``-o/--original``  try the original TIFF files before JPEG;
            ``-h/--help``  print usage and exit.

    Raises:
        SystemExit: after printing help (status 0) or on an invalid
            option (status 2).
    """
    directory, imgloturl, img_url_patterns = _parse_options(argv)

    os.makedirs(directory, exist_ok=True)
    jsonfile = os.path.join(directory, "allfiles.json")
    # The listing text may contain non-ASCII characters, so pin the encoding.
    with open(jsonfile, "w", encoding="utf-8") as json_all:
        i = 1
        eso = imgloturl + str(i)
        while requests.head(eso).status_code < 400:
            listjs = _extract_image_list(requests.get(eso).text)
            if listjs is None:
                # Stop instead of reusing the previous page's list.
                print("Image list not found for " + eso)
                break
            json_all.write(listjs + ",")
            for entry in demjson.decode(listjs):
                _scrape_image(entry, directory, img_url_patterns)
            i += 1
            eso = imgloturl + str(i)
|
138 |
|
139 |
|
if __name__ == '__main__':
    # Run the scraper only when executed as a script; the program name is
    # dropped so main() receives just the option arguments.
    cli_args = sys.argv[1:]
    main(cli_args)