data/script/scrapeso.py
author ymh <ymh.work@gmail.com>
Wed, 01 Aug 2018 14:43:20 +0200
changeset 28 15f63c5dfe3f
parent 8 5459fff1356b
permissions -rw-r--r--
Correct path for source image in path, take the real one, not the expected one. increase version nb

import json
import os
import sys, getopt
import demjson
from bs4 import BeautifulSoup
import urllib.request
import requests

def _find_image_list_js(soup):
    """Return the JavaScript array literal found in the ``var images`` script.

    Looks at the first ``<script>`` element whose text contains
    ``var images`` and returns its text from the first ``[`` up to (but not
    including) its last two characters.  Returns ``None`` when no such script
    exists or when it contains no ``[``.
    """
    for script in soup.find_all('script'):
        scripttext = script.text
        if "var images" not in scripttext:
            continue
        start = scripttext.find("[")
        if start == -1:
            return None
        # The last two characters are dropped as trailing, non-array text
        # (presumably the statement terminator and a newline -- TODO confirm
        # against a live list page).
        return scripttext[start:-2]
    return None


def _collect_description(imagesoup):
    """Return ``(text, html)`` built from the ``<p>`` elements after the image.

    Paragraphs are taken in document order and collection stops at the first
    paragraph whose previous element mentions ``credit``.  Empty paragraphs
    are left out of both results.
    """
    text_parts = []
    html_parts = []
    for descriptionp in imagesoup.find_all_next("p"):
        if "credit" in str(descriptionp.previous_element):
            break
        descriptiontext_p = descriptionp.text.strip()
        if descriptiontext_p:
            text_parts.append(descriptiontext_p + "\n")
        descriptionhtml_p = str(descriptionp).strip()
        if descriptionhtml_p != "<p></p>":
            html_parts.append(descriptionhtml_p)
    return "".join(text_parts), "".join(html_parts)


def _second_cell_text(row):
    """Return the text of the second ``<td>`` of a table row."""
    return row.find_all("td")[1].text


def _process_image(realurl, directory, img_url_patterns):
    """Scrape one image page and store the image and its metadata.

    Creates ``<directory>/<image id>/`` holding the downloaded image and a
    ``<image id>.json`` metadata file.  The page is skipped, with a message
    printed, when it was already processed, when required markup is missing,
    when none of ``img_url_patterns`` answers with HTTP 200, or when the
    target folder already exists.
    """
    # The last URL path segment is used as a cheap "already done" key so the
    # detail page does not have to be fetched again on a re-run.
    tempo_imgid = realurl.strip('/').split('/')[-1]
    if os.path.isfile(os.path.join(directory, tempo_imgid, tempo_imgid + ".json")):
        print("Image alredy processed skipping %s" % realurl)
        return

    page = requests.get(realurl)
    soup = BeautifulSoup(page.text, "html5lib")

    # The first table is read as image metadata and the second, when there is
    # one, as object metadata.
    object_info = soup.find("div", class_="object-info")
    tables = object_info.find_all("table") if object_info is not None else []
    if not tables:
        print("Image information not found for " + realurl)
        return
    infosimg = tables[0]
    infosobj = tables[1] if len(tables) > 1 else None

    infosdict = {'image': {}, 'object': {}, 'url': realurl}
    infosdict['image']['title'] = soup.find("h1").text
    infosdict['image']['credit'] = soup.find("div", class_="credit").text
    imagesoup = soup.find("div", class_="archive-image archive-image-dark popup")
    if not imagesoup:
        print("Image not found for " + realurl)
        return
    print(realurl)

    descriptiontext, descriptionhtml = _collect_description(imagesoup)
    infosdict['image']['description_text'] = descriptiontext
    infosdict['image']['description_html'] = descriptionhtml

    # Start from None so an id can never leak in from a previous image.
    imgid = None
    for tableimg in infosimg.find_all("tr"):
        rowtext = tableimg.text
        if "Id" in rowtext:
            imgid = _second_cell_text(tableimg)
            infosdict['image']['id'] = imgid
        if "Type" in rowtext:
            infosdict['image']['type'] = _second_cell_text(tableimg)
        if "Date" in rowtext or "date" in rowtext:
            infosdict['image']['date_publication'] = _second_cell_text(tableimg)
    if imgid is None:
        print("Image id not found for " + realurl)
        return

    if infosobj is not None:
        for tableobj in infosobj.find_all("tr"):
            rowtext = tableobj.text
            if "Name" in rowtext or "Nom" in rowtext:
                infosdict['object']['name'] = _second_cell_text(tableobj)
            if "Type" in rowtext:
                infosdict['object']['type'] = _second_cell_text(tableobj)

    # Probe the candidate locations in order and keep the first that exists.
    imgurl = None
    img = None
    for imgurl_pattern, img_pattern in img_url_patterns:
        imgurl_test = imgurl_pattern.format(imgid=imgid)
        if requests.head(imgurl_test).status_code == 200:
            imgurl = imgurl_test
            img = img_pattern.format(imgid=imgid)
            break
    if imgurl is None:
        return

    infosdict['image']['imgurl'] = imgurl
    imgdirectory = os.path.join(directory, imgid)
    if os.path.exists(imgdirectory):
        print("Problem processing %s, folder already exists with id %s " % (realurl, imgid))
        return
    os.mkdir(imgdirectory)
    urllib.request.urlretrieve(imgurl, os.path.join(imgdirectory, img))
    with open(os.path.join(imgdirectory, imgid + ".json"), "w") as json_img_file:
        json_img_file.write(json.dumps(infosdict, sort_keys=True, indent=4, separators=(',', ': ')))
    print(realurl + " : OK")


def _process_list_page(list_url, directory, img_url_patterns, json_all):
    """Scrape every image referenced by one paginated list page.

    The raw image array of the page is appended to ``json_all`` followed by a
    comma, then each entry's ``url`` is resolved against
    ``https://www.eso.org`` and handed to ``_process_image``.
    """
    page = requests.get(list_url)
    soup = BeautifulSoup(page.text, "html5lib")
    listjs = _find_image_list_js(soup)
    if listjs is None:
        print("Image list not found for " + list_url)
        return
    # NOTE: the arrays of successive pages are written back to back, separated
    # by commas, so the resulting file is a raw dump rather than one JSON value.
    json_all.write(listjs + ",")
    for entry in demjson.decode(listjs):
        _process_image("https://www.eso.org" + entry['url'], directory, img_url_patterns)


def main(argv):
    '''
    Scrape an ESO image collection into a local directory.

    argv is the list of command-line arguments without the program name.
    Supported options:

    -h              print the help text and exit
    -d, --dir       output directory (default "./scrapESO/")
    -u, --url       image group url, for example
                    https://www.eso.org/public/france/images/archive/category/alma/
                    ("list/" and the page number are appended to it)
    -o, --original  try the original / publication TIFF files before the
                    large JPEG

    List pages are requested as <url>list/1, <url>list/2, ... until a HEAD
    request answers with a status code of 400 or more.  Exits with status 2
    when the options cannot be parsed.
    '''
    directory = './scrapESO/'
    imgloturl = 'https://www.eso.org/public/france/images/viewall/list/'
    img_url_patterns = [
        ('https://cdn.eso.org/images/publicationjpg/{imgid}.jpg', '{imgid}.jpg'),
        ('https://cdn.eso.org/images/large/{imgid}.jpg', '{imgid}.jpg')
    ]
    try:
        # "o" / "original" must be declared here, otherwise getopt rejects the
        # flag and the corresponding branch below can never run.
        opts, _args = getopt.getopt(argv, "hd:u:o", ["dir=", "url=", "original"])
    except getopt.GetoptError:
        print('test.py -d <directory> -u <imagegroupurl> [-o]')
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print('''test.py     -d <directory> (default is : "./scrapESO/")
            -u <imagegroupurl> (default is : "https://www.eso.org/public/france/images/viewall/")
            -o download the original / publication TIFF files when available''')
            sys.exit()
        elif opt in ("-d", "--dir"):
            directory = arg
        elif opt in ("-u", "--url"):
            # Accept the url with or without its trailing slash.
            imgloturl = arg.rstrip('/') + "/list/"
        elif opt in ("-o", "--original"):
            img_url_patterns = [
                ('https://www.eso.org/public/archives/images/original/{imgid}.tif', '{imgid}.tif'),
                ('https://www.eso.org/public/archives/images/publicationtiff/{imgid}.tif', '{imgid}.tif'),
                ('https://cdn.eso.org/images/large/{imgid}.jpg', '{imgid}.jpg')]

    os.makedirs(directory, exist_ok=True)
    jsonfile = os.path.join(directory, "allfiles.json")
    # The context manager closes the dump even when a page fails half-way;
    # utf-8 keeps non-ASCII page text writable whatever the locale is.
    with open(jsonfile, "w", encoding="utf-8") as json_all:
        i = 1
        while True:
            eso = imgloturl + str(i)
            if requests.head(eso).status_code >= 400:
                break
            _process_list_page(eso, directory, img_url_patterns, json_all)
            i += 1


if __name__ == '__main__':
    # Run the scraper with the command-line arguments, minus the program name.
    cli_arguments = sys.argv[1:]
    main(cli_arguments)