# Correct the path used for the source image: take the real one, not the expected one. Increase the version number.
import json
import os
import sys, getopt
import demjson
from bs4 import BeautifulSoup
import urllib.request
import requests
def _parse_options(argv):
    '''
    Translate the command-line options into the scraper settings.

    argv is the argument list without the program name. Returns a
    (directory, imgloturl, img_url_patterns) tuple, where img_url_patterns is
    a list of (download url template, local file name template) pairs tried
    in order. Exits with status 2 on an unknown option, and with the default
    status after printing the help for -h.
    '''
    directory = './scrapESO/'
    imgloturl = 'https://www.eso.org/public/france/images/viewall/list/'
    img_url_patterns = [
        ('https://cdn.eso.org/images/publicationjpg/{imgid}.jpg', '{imgid}.jpg'),
        ('https://cdn.eso.org/images/large/{imgid}.jpg', '{imgid}.jpg'),
    ]
    try:
        # -o/--original is a plain flag, so it is declared without ":" / "=".
        opts, _ = getopt.getopt(argv, "hod:u:", ["dir=", "url=", "original"])
    except getopt.GetoptError:
        print('test.py -d <directory> -u <imagegroupurl> [-o]')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('''test.py -d <directory> (default is : "./scrapESO/")
-u <imagegroupurl> (default is : "https://www.eso.org/public/france/images/viewall/list/")
-o (try the original/publication TIFF files before the large JPEG)''')
            sys.exit()
        elif opt in ("-d", "--dir"):
            directory = arg
        elif opt in ("-u", "--url"):
            # Accept the group url with or without its trailing slash.
            imgloturl = arg.rstrip("/") + "/list/"
        elif opt in ("-o", "--original"):
            img_url_patterns = [
                ('https://www.eso.org/public/archives/images/original/{imgid}.tif', '{imgid}.tif'),
                ('https://www.eso.org/public/archives/images/publicationtiff/{imgid}.tif', '{imgid}.tif'),
                ('https://cdn.eso.org/images/large/{imgid}.jpg', '{imgid}.jpg'),
            ]
    return directory, imgloturl, img_url_patterns


def _extract_image_list(soup):
    '''
    Return the text of the array found in the first <script> element that
    mentions "var images", or None when the page has no such script or the
    script contains no "[".
    '''
    for script in soup.find_all('script'):
        text = script.text
        if "var images" in text:
            start = text.find("[")
            if start == -1:
                return None
            # The last two characters are cut off; presumably they are the
            # statement terminator after the closing bracket - TODO confirm.
            return text[start:-2]
    return None


def _collect_description(imagesoup):
    '''
    Gather the paragraphs that follow the image element, stopping at the
    first one whose preceding element mentions "credit".

    Returns a (plain text, html) tuple; empty paragraphs are left out.
    '''
    text_parts = []
    html_parts = []
    for paragraph in imagesoup.find_all_next("p"):
        if "credit" in str(paragraph.previous_element):
            break
        paragraph_text = paragraph.text.strip()
        if paragraph_text:
            text_parts.append(paragraph_text + "\n")
        paragraph_html = str(paragraph).strip()
        if paragraph_html != "<p></p>":
            html_parts.append(paragraph_html)
    return "".join(text_parts), "".join(html_parts)


def _row_value(row):
    '''Return the text of the second cell of a table row, or None if the row has fewer than two cells.'''
    cells = row.find_all("td")
    return cells[1].text if len(cells) > 1 else None


def _find_image_url(imgid, img_url_patterns):
    '''
    Return (url, local file name) for the first pattern whose url answers a
    HEAD request with status 200, or (None, None) when none does.
    '''
    for url_pattern, name_pattern in img_url_patterns:
        candidate = url_pattern.format(imgid=imgid)
        if requests.head(candidate).status_code == 200:
            return candidate, name_pattern.format(imgid=imgid)
    return None, None


def _scrape_image(realurl, directory, img_url_patterns):
    '''
    Fetch one image page, download its image into <directory>/<image id>/
    and write the collected metadata as <image id>.json in the same folder.

    Pages that do not have the expected layout, that expose no image id, or
    for which no download url answers, are reported and skipped.
    '''
    page = requests.get(realurl)
    soup = BeautifulSoup(page.text, "html5lib")

    info_div = soup.find("div", class_="object-info")
    tables = info_div.find_all("table") if info_div is not None else []
    title_tag = soup.find("h1")
    credit_tag = soup.find("div", class_="credit")
    if len(tables) < 2 or title_tag is None or credit_tag is None:
        print("Unexpected page layout, skipping " + realurl)
        return
    infosimg, infosobj = tables[0], tables[1]

    image_info = {'title': title_tag.text, 'credit': credit_tag.text}
    object_info = {}
    infosdict = {'url': realurl, 'image': image_info, 'object': object_info}

    imagesoup = soup.find("div", class_="archive-image archive-image-dark popup")
    if imagesoup is None:
        print("Image not found for " + realurl)
        return
    print(realurl)

    description_text, description_html = _collect_description(imagesoup)
    image_info['description_text'] = description_text
    image_info['description_html'] = description_html

    # The row labels are matched as substrings of the whole row text.
    for row in infosimg.find_all("tr"):
        value = _row_value(row)
        if value is None:
            continue
        label = row.text
        if "Id" in label:
            image_info['id'] = value
        if "Type" in label:
            image_info['type'] = value
        if "Date" in label or "date" in label:
            image_info['date_publication'] = value
    for row in infosobj.find_all("tr"):
        value = _row_value(row)
        if value is None:
            continue
        label = row.text
        if "Name" in label or "Nom" in label:
            object_info['name'] = value
        if "Type" in label:
            object_info['type'] = value

    # The id shown on the page, not the one guessed from the url, names the
    # download and the target folder; never reuse an id from another image.
    imgid = image_info.get('id')
    if not imgid:
        print("No image id found for " + realurl)
        return

    imgurl, img = _find_image_url(imgid, img_url_patterns)
    if imgurl is None:
        print("No downloadable image found for " + realurl)
        return
    image_info['imgurl'] = imgurl

    imgdirectory = os.path.join(directory, imgid)
    if os.path.exists(imgdirectory):
        print("Problem processing %s, folder already exists with id %s " % (realurl, imgid))
        return
    os.mkdir(imgdirectory)
    urllib.request.urlretrieve(imgurl, os.path.join(imgdirectory, img))
    with open(os.path.join(imgdirectory, imgid + ".json"), "w", encoding="utf-8") as json_img_file:
        json_img_file.write(json.dumps(infosdict, sort_keys=True, indent=4, separators=(',', ': ')))
    print(realurl + " : OK")


def main(argv):
    '''
    Scrape an ESO image group: download every image and its metadata.

    argv is the command line without the program name. Recognised options:
      -d / --dir <directory>    output directory (default "./scrapESO/")
      -u / --url <group url>    image group url, for example
                                https://www.eso.org/public/france/images/archive/category/alma/
      -o / --original           try the original TIFF files first
      -h                        print the usage and exit

    The numbered list pages of the group are visited until one answers a
    HEAD request with a status of 400 or more. The raw image list of each
    page is appended to <directory>/allfiles.json, followed by a comma.
    '''
    directory, imgloturl, img_url_patterns = _parse_options(argv)
    os.makedirs(directory, exist_ok=True)
    jsonfile = os.path.join(directory, "allfiles.json")
    # utf-8 is forced because the list is written verbatim from the page.
    with open(jsonfile, "w", encoding="utf-8") as json_all:
        page_number = 1
        eso = imgloturl + str(page_number)
        while requests.head(eso).status_code < 400:
            page = requests.get(eso)
            soup = BeautifulSoup(page.text, "html5lib")
            listjs = _extract_image_list(soup)
            if listjs is None:
                # Without a list there is nothing to process; stop rather
                # than reprocess the previous page's entries.
                print("No image list found on %s, stopping" % eso)
                break
            json_all.write(listjs + ",")
            for entry in demjson.decode(listjs):
                realurl = "https://www.eso.org" + entry['url']
                tempo_imgid = realurl.strip('/').split('/')[-1]
                tempo_json = os.path.join(directory, tempo_imgid, tempo_imgid + ".json")
                if os.path.isfile(tempo_json):
                    print("Image alredy processed skipping %s" % realurl)
                    continue
                _scrape_image(realurl, directory, img_url_patterns)
            page_number += 1
            eso = imgloturl + str(page_number)
if __name__ == '__main__':
    # Run as a script: pass main() everything that follows the program name.
    cli_arguments = sys.argv[1:]
    main(cli_arguments)