import json
import os
import sys, getopt
import demjson
from bs4 import BeautifulSoup
import urllib.request
import requests
def _parse_options(argv):
    """Parse the command-line options of the scraper.

    Recognised options:
      -h / --help       print the usage text and exit
      -d / --dir DIR    output directory (default "./scrapESO/")
      -u / --url URL    image group URL; "list/" is appended to it
      -o / --original   try the TIFF URL patterns first and fall back to
                        the large JPEG

    Returns a tuple (directory, imgloturl, img_url_patterns) where
    img_url_patterns is a list of (url_template, filename_template) pairs,
    both formatted with the keyword ``imgid``.

    Exits with status 2 on an unknown option.
    """
    directory = './scrapESO/'
    imgloturl = 'https://www.eso.org/public/france/images/viewall/list/'
    img_url_patterns = [
        ('https://cdn.eso.org/images/publicationjpg/{imgid}.jpg', '{imgid}.jpg'),
        ('https://cdn.eso.org/images/large/{imgid}.jpg', '{imgid}.jpg'),
    ]
    try:
        # "o" / "original" must be registered here, otherwise getopt rejects
        # the option before the loop below ever sees it.
        opts, _unused_args = getopt.getopt(
            argv, "hod:u:", ["help", "dir=", "url=", "original"])
    except getopt.GetoptError:
        print('test.py -d <directory> -u <imagegroupurl> [-o]')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            print('''test.py -d <directory> (default is : "./scrapESO/")
        -u <imagegroupurl> (default is : "https://www.eso.org/public/france/images/viewall/")
        -o (download the original TIFF when available instead of the JPEG)''')
            sys.exit()
        elif opt in ("-d", "--dir"):
            directory = arg
        elif opt in ("-u", "--url"):
            # Tolerate a missing trailing slash on the user-supplied URL.
            imgloturl = arg.rstrip("/") + "/list/"
        elif opt in ("-o", "--original"):
            img_url_patterns = [
                ('https://www.eso.org/public/archives/images/original/{imgid}.tif', '{imgid}.tif'),
                ('https://www.eso.org/public/archives/images/publicationtiff/{imgid}.tif', '{imgid}.tif'),
                ('https://cdn.eso.org/images/large/{imgid}.jpg', '{imgid}.jpg'),
            ]
    return directory, imgloturl, img_url_patterns


def _extract_image_list_js(soup):
    """Return the JavaScript array literal found in the "var images" script.

    The first <script> whose text contains "var images" is used; the result
    is the slice from the first "[" up to (excluding) the last two
    characters of the script text (presumably a trailing ";" and newline --
    TODO confirm against a live page).

    Returns None when no such script exists or the slice is empty.
    """
    for script in soup.find_all('script'):
        script_text = script.text
        if "var images" not in script_text:
            continue
        start = script_text.find("[")
        if start == -1:
            return None
        return script_text[start:-2] or None
    return None


def _tag_text(tag):
    """Return ``tag.text``, or an empty string when the tag was not found."""
    return tag.text if tag is not None else ""


def _second_cell_text(row):
    """Return the text of the second <td> of a table row, or None if absent."""
    cells = row.find_all("td")
    return cells[1].text if len(cells) > 1 else None


def _collect_description(imagesoup):
    """Gather the description paragraphs that follow the image element.

    Walks every <p> after ``imagesoup`` and stops at the first one whose
    preceding element mentions "credit".  Returns a tuple
    (plain_text, html): plain_text has one stripped paragraph per line,
    html is the concatenation of the non-empty <p> elements.
    """
    text_parts = []
    html_parts = []
    for paragraph in imagesoup.find_all_next("p"):
        if "credit" in str(paragraph.previous_element):
            break
        stripped_text = paragraph.text.strip()
        if stripped_text:
            text_parts.append(stripped_text + "\n")
        stripped_html = str(paragraph).strip()
        if stripped_html != "<p></p>":
            html_parts.append(stripped_html)
    return "".join(text_parts), "".join(html_parts)


def _find_image_url(imgid, img_url_patterns):
    """Return (url, filename) for the first pattern answering HTTP 200.

    Each pattern is probed with a HEAD request, in order.  Returns
    (None, None) when no candidate URL answers with status 200.
    """
    for url_pattern, file_pattern in img_url_patterns:
        candidate = url_pattern.format(imgid=imgid)
        if requests.head(candidate).status_code == 200:
            return candidate, file_pattern.format(imgid=imgid)
    return None, None


def _process_image(realurl, directory, img_url_patterns):
    """Scrape one image page, then download the image and write its metadata.

    On success this creates ``<directory>/<imgid>/`` containing the image
    file and ``<imgid>.json``.  The page is skipped (nothing is written)
    when the image element, the image id, or a downloadable URL is missing.
    """
    page = requests.get(realurl)
    soup = BeautifulSoup(page.text, "html5lib")
    imagesoup = soup.find("div", class_="archive-image archive-image-dark popup")
    if imagesoup is None:
        print("Image not found for " + realurl)
        return
    print(realurl)

    image_info = {
        'title': _tag_text(soup.find("h1")),
        'credit': _tag_text(soup.find("div", class_="credit")),
    }
    object_info = {}
    infosdict = {'image': image_info, 'object': object_info, 'url': realurl}
    image_info['description_text'], image_info['description_html'] = \
        _collect_description(imagesoup)

    # The first table of the "object-info" div is read as image metadata and
    # the second as object metadata; either may be absent.
    info_div = soup.find("div", class_="object-info")
    tables = info_div.find_all("table") if info_div is not None else []

    # Reset per page so an id from a previous image can never leak into
    # this one.
    imgid = None
    if tables:
        for row in tables[0].find_all("tr"):
            value = _second_cell_text(row)
            if value is None:
                continue
            label = row.text
            if "Id" in label:
                imgid = value
                image_info['id'] = imgid
            if "Type" in label:
                image_info['type'] = value
            if "Date" in label or "date" in label:
                image_info['date_publication'] = value
    if len(tables) > 1:
        for row in tables[1].find_all("tr"):
            value = _second_cell_text(row)
            if value is None:
                continue
            label = row.text
            if "Name" in label or "Nom" in label:
                object_info['name'] = value
            if "Type" in label:
                object_info['type'] = value

    if imgid is None:
        print("Image id not found for " + realurl)
        return

    imgurl, img = _find_image_url(imgid, img_url_patterns)
    if imgurl is None:
        # No candidate URL is reachable: skip silently, as before.
        return
    image_info['imgurl'] = imgurl

    imgdirectory = os.path.join(directory, imgid)
    # exist_ok lets the script be re-run over an existing output tree.
    os.makedirs(imgdirectory, exist_ok=True)
    urllib.request.urlretrieve(imgurl, os.path.join(imgdirectory, img))
    jsondirection = os.path.join(imgdirectory, imgid + ".json")
    with open(jsondirection, "w", encoding="utf-8") as json_img_file:
        json_img_file.write(json.dumps(
            infosdict, sort_keys=True, indent=4, separators=(',', ': ')))
    print(realurl + " : OK")


def main(argv):
    """Download every image of an ESO image listing together with its metadata.

    ``argv`` is the command line without the program name.  Options:
      -d / --dir DIR    output directory (default "./scrapESO/")
      -u / --url URL    image group URL, for example
                        https://www.eso.org/public/france/images/archive/category/alma/
      -o / --original   prefer the original TIFF files over the JPEGs
      -h / --help       print the usage text and exit

    The listing pages ``<url>list/1``, ``<url>list/2``, ... are visited until
    a HEAD request answers with a status >= 400.  The raw image list of every
    page is appended to ``<directory>/allfiles.json``, and each image gets its
    own ``<directory>/<imgid>/`` folder holding the image file and a JSON
    description.
    """
    directory, imgloturl, img_url_patterns = _parse_options(argv)
    os.makedirs(directory, exist_ok=True)
    jsonfile = os.path.join(directory, "allfiles.json")
    # The context manager closes the file even if a request or parse fails.
    with open(jsonfile, "w", encoding="utf-8") as json_all:
        page_number = 1
        while True:
            eso = imgloturl + str(page_number)
            if requests.head(eso).status_code >= 400:
                break
            soup = BeautifulSoup(requests.get(eso).text, "html5lib")
            listjs = _extract_image_list_js(soup)
            if listjs is None:
                # Without an image list there is nothing to iterate on; stop
                # rather than re-using data from a previous page.
                print("Image list not found for " + eso)
                break
            json_all.write(listjs + ",")
            for entry in demjson.decode(listjs):
                _process_image("https://www.eso.org" + entry['url'],
                               directory, img_url_patterns)
            page_number += 1
# Script entry point: hand the command-line arguments (program name removed)
# over to main() when the file is executed directly.
if __name__ == "__main__":
    cli_arguments = sys.argv[1:]
    main(cli_arguments)