diff -r d3fe1866eb5b -r 16fb4f5efa69 data/script/scrapeso.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data/script/scrapeso.py	Thu Jun 28 15:19:43 2018 +0200
@@ -0,0 +1,147 @@
+import json
+import os
+import sys, getopt
+import demjson
+from bs4 import BeautifulSoup
+import urllib.request
+import requests
+
+def main(argv):
+    '''
+    Scrape the ESO public image archive.
+    Pass the image list URL with -u, for example
+    https://www.eso.org/public/france/images/archive/category/alma/,
+    and the output directory with -d.
+    '''
+    directory = './scrapESO/'
+    imgloturl = 'https://www.eso.org/public/france/images/viewall/list/'
+    # (image URL pattern, local file name pattern), tried in order
+    img_url_patterns = [
+        ('https://cdn.eso.org/images/publicationjpg/{imgid}.jpg', '{imgid}.jpg'),
+        ('https://cdn.eso.org/images/large/{imgid}.jpg', '{imgid}.jpg')
+    ]
+    try:
+        opts, args = getopt.getopt(argv, "hd:u:o", ["dir=", "url=", "original"])
+    except getopt.GetoptError:
+        print('scrapeso.py -d <directory> -u <url> [-o]')
+        sys.exit(2)
+    for opt, arg in opts:
+        if opt == '-h':
+            print('''scrapeso.py -d <directory> (default is "./scrapESO/")
+            -u <url> (default is "https://www.eso.org/public/france/images/viewall/list/")
+            -o : prefer the original/publication TIFF files when available''')
+            sys.exit()
+        elif opt in ("-d", "--dir"):
+            directory = arg
+        elif opt in ("-u", "--url"):
+            imgloturl = arg + "list/"
+        elif opt in ("-o", "--original"):
+            img_url_patterns = [
+                ('https://www.eso.org/public/archives/images/original/{imgid}.tif', '{imgid}.tif'),
+                ('https://www.eso.org/public/archives/images/publicationtiff/{imgid}.tif', '{imgid}.tif'),
+                ('https://cdn.eso.org/images/large/{imgid}.jpg', '{imgid}.jpg')]
+
+    if not os.path.exists(directory):
+        os.mkdir(directory)
+    jsonfile = directory + "allfiles.json"
+    json_all = open(jsonfile, "w")
+    i = 1
+    eso = imgloturl + str(i)
+    # walk the paginated list until a page returns an error status
+    while requests.head(eso).status_code < 400:
+        page = requests.get(eso)
+        soup = BeautifulSoup(page.text, "html5lib")
+        # the image list is embedded in a "var images = [...]" script block
+        start = 0
+        for scripts in soup.find_all('script'):
+            if "var images" in scripts.text:
+                scripttext = scripts.text
+                break
+        for scriptchar in scripttext:
+            if scriptchar == "[":
+                break
+            start += 1
+        listjs = scripttext[start:-2]
+        # keep the raw JS array of every page (the resulting file is not strict JSON)
+        json_all.write(listjs + ",")
+        listdem = demjson.decode(listjs)
+        for j in listdem:
+            infosdict = {}
+            infosdict['image'] = {}
+            infosdict['object'] = {}
+            realurl = "https://www.eso.org" + j['url']
+            page = requests.get(realurl)
+            soup = BeautifulSoup(page.text, "html5lib")
+            infosimg = soup.find("div", class_="object-info").find_all("table")[0]
+            infosobj = soup.find("div", class_="object-info").find_all("table")[1]
+            infosdict['url'] = realurl
+            title = soup.find("h1").text
+            infosdict['image']['title'] = title
+            infosdict['image']['credit'] = soup.find("div", class_="credit").text
+            imagesoup = soup.find("div", class_="archive-image archive-image-dark popup")
+            if not imagesoup:
+                print("Image not found for " + realurl)
+                continue
+            print(realurl)
+            # collect the description paragraphs that follow the image, up to the credit block
+            descriptionps = imagesoup.find_all_next("p")
+            descriptiontext = ""
+            descriptionhtml = ""
+            for descriptionp in descriptionps:
+                if "credit" in str(descriptionp.previous_element):
+                    break
+                descriptiontext_p = descriptionp.text.strip()
+                if descriptiontext_p:
+                    descriptiontext += descriptiontext_p + "\n"
+
+                descriptionhtml_p = str(descriptionp).strip()
+                if descriptionhtml_p != "<br/>":
+                    descriptionhtml += descriptionhtml_p
+            infosdict['image']['description_text'] = descriptiontext
+            infosdict['image']['description_html'] = descriptionhtml
+            # "About the Image" table
+            for tableimg in infosimg.find_all("tr"):
+                if "Id" in tableimg.text:
+                    imgid = tableimg.find_all("td")[1].text
+                    infosdict['image']['id'] = imgid
+                if "Type" in tableimg.text:
+                    infosdict['image']['type'] = tableimg.find_all("td")[1].text
+                if "Date" in tableimg.text or "date" in tableimg.text:
+                    infosdict['image']['date_publication'] = tableimg.find_all("td")[1].text
+            # "About the Object" table (English or French headers)
+            for tableobj in infosobj.find_all("tr"):
+                if "Name" in tableobj.text or "Nom" in tableobj.text:
+                    infosdict['object']['name'] = tableobj.find_all("td")[1].text
+                if "Type" in tableobj.text:
+                    infosdict['object']['type'] = tableobj.find_all("td")[1].text
+            # try the download URLs in order and keep the first one that responds
+            imgurl = None
+            img = None
+            for imgurl_pattern, img_pattern in img_url_patterns:
+                imgurl_test = imgurl_pattern.format(imgid=imgid)
+                if requests.head(imgurl_test).status_code == 200:
+                    imgurl = imgurl_test
+                    img = img_pattern.format(imgid=imgid)
+                    break
+            if imgurl is None:
+                continue
+
+            infosdict['image']['imgurl'] = imgurl
+            imgdirectory = directory + imgid
+            if not os.path.exists(imgdirectory):
+                os.mkdir(imgdirectory)
+            imgdirection = imgdirectory + "/" + img
+            urllib.request.urlretrieve(imgurl, imgdirection)
+            jsonfname = imgid + ".json"
+            jsondirection = imgdirectory + "/" + jsonfname
+            json_img_file = open(jsondirection, "w")
+            json_img_file.write(json.dumps(infosdict, sort_keys=True, indent=4, separators=(',', ': ')))
+            json_img_file.close()
+            print(realurl + " : OK")
+        i += 1
+        eso = imgloturl + str(i)
+    json_all.close()
+
+
+if __name__ == '__main__':
+    main(sys.argv[1:])