data/script/scrapeso.py
changeset 3 16fb4f5efa69
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data/script/scrapeso.py	Thu Jun 28 15:19:43 2018 +0200
@@ -0,0 +1,141 @@
+import json
+import os
+import sys, getopt
+import demjson
+from bs4 import BeautifulSoup
+import urllib.request
+import requests
+
+def main(argv):
+    '''
+    Scrape images and their metadata from the ESO public image archive.
+    Pass the image list URL with -u
+    (for example https://www.eso.org/public/france/images/archive/category/alma/)
+    and the output directory with -d.
+    '''
+    directory = './scrapESO/'
+    imgloturl = 'https://www.eso.org/public/france/images/viewall/list/'
+    img_url_patterns = [
+        ('https://cdn.eso.org/images/publicationjpg/{imgid}.jpg', '{imgid}.jpg'),
+        ('https://cdn.eso.org/images/large/{imgid}.jpg', '{imgid}.jpg')
+    ]
+    try:
+        opts, args = getopt.getopt(argv, "hd:u:o", ["dir=", "url=", "original"])
+    except getopt.GetoptError:
+        print('scrapeso.py -d <directory> -u <imagegroupurl> [-o]')
+        sys.exit(2)
+    for opt, arg in opts:
+        if opt == '-h':
+            print('''scrapeso.py -d <directory> (default is: "./scrapESO/")
+            -u <imagegroupurl> (default is: "https://www.eso.org/public/france/images/viewall/list/")
+            -o to download original/publication TIFF files when available''')
+            sys.exit()
+        elif opt in ("-d", "--dir"):
+            directory = arg
+        elif opt in ("-u", "--url"):
+            imgloturl = arg + "list/"
+        elif opt in ("-o", "--original"):
+            img_url_patterns = [
+                ('https://www.eso.org/public/archives/images/original/{imgid}.tif', '{imgid}.tif'),
+                ('https://www.eso.org/public/archives/images/publicationtiff/{imgid}.tif', '{imgid}.tif'),
+                ('https://cdn.eso.org/images/large/{imgid}.jpg', '{imgid}.jpg')]
+
+    if not os.path.exists(directory):
+        os.mkdir(directory)
+    jsonfile = os.path.join(directory, "allfiles.json")
+    json_all = open(jsonfile, "w")
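+    # Walk the paginated list pages (<imgloturl>1, <imgloturl>2, ...) until a
+    # page no longer exists; the raw per-page image lists are appended to
+    # allfiles.json as they are scraped.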
+    i = 1
+    eso = imgloturl + str(i)
+    while requests.head(eso).status_code < 400:
+        page = requests.get(eso)
+        soup = BeautifulSoup(page.text, "html5lib")
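+        # Each list page embeds its image metadata as a JavaScript array in a
+        # "var images = [...]" assignment inside one of the <script> tags.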
+        scripttext = ""
+        for scripts in soup.find_all('script'):
+            if "var images" in scripts.text:
+                scripttext = scripts.text
+                break
+        if not scripttext:
+            print("No image list found on " + eso)
+            break
+        # Keep only the JavaScript array literal: everything from the first "["
+        # of the assignment, dropping the trailing characters that close it.
+        start = scripttext.find("[")
+        listjs = scripttext[start:-2]
+        json_all.write(listjs + ",")
+        listdem = demjson.decode(listjs)
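+        # Visit each image's detail page and collect its metadata.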
+        for j in listdem:
+            infosdict = {}
+            infosdict['image'] = {}
+            infosdict['object'] = {}
+            imgid = None
+            realurl = "https://www.eso.org" + j['url']
+            page = requests.get(realurl)
+            soup = BeautifulSoup(page.text, "html5lib")
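+            # Under div.object-info, the first table describes the image and
+            # the second describes the observed object.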
+            infosimg = soup.find("div", class_="object-info").find_all("table")[0]
+            infosobj = soup.find("div", class_="object-info").find_all("table")[1]
+            infosdict['url'] = realurl
+            title = soup.find("h1").text
+            infosdict['image']['title'] = title
+            infosdict['image']['credit'] = soup.find("div", class_="credit").text
+            imagesoup = soup.find("div", class_="archive-image archive-image-dark popup")
+            if not imagesoup:
+                print("Image not found for " + realurl)
+                continue
+            print(realurl)
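+            # Collect the description paragraphs that follow the image block,
+            # stopping when the credit block is reached.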
+            descriptionps = imagesoup.find_all_next("p")
+            descriptiontext = ""
+            descriptionhtml = ""
+            for descriptionp in descriptionps:
+                if "credit" in str(descriptionp.previous_element):
+                    break
+                descriptiontext_p = descriptionp.text.strip()
+                if descriptiontext_p:
+                    descriptiontext += descriptiontext_p + "\n"
+
+                descriptionhtml_p = str(descriptionp).strip()
+                if descriptionhtml_p != "<p></p>":
+                    descriptionhtml += descriptionhtml_p
+            infosdict['image']['description_text'] = descriptiontext
+            infosdict['image']['description_html'] = descriptionhtml
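+            # Pull the image id, type and publication date out of the image
+            # info table; row labels may be localised (e.g. French pages).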
+            for tableimg in infosimg.find_all("tr"):
+                if "Id" in tableimg.text:
+                    imgid = tableimg.find_all("td")[1].text
+                    infosdict['image']['id'] = imgid
+                if "Type" in tableimg.text:
+                    infosdict['image']['type'] = tableimg.find_all("td")[1].text
+                if "Date" in tableimg.text or "date" in tableimg.text:
+                    infosdict['image']['date_publication'] = tableimg.find_all("td")[1].text
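+            # Pull the object name and type out of the object info table.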
+            for tableobj in infosobj.find_all("tr"):
+                if "Name" in tableobj.text or "Nom" in tableobj.text:
+                    infosdict['object']['name'] = tableobj.find_all("td")[1].text
+                if "Type" in tableobj.text:
+                    infosdict['object']['type'] = tableobj.find_all("td")[1].text
+            # The image id is needed to build the download URLs; skip this
+            # page if it was not found in the image info table.
+            if imgid is None:
+                print("Image id not found for " + realurl)
+                continue
+            # Try the candidate download URLs in order and keep the first one
+            # that the server actually has.
+            imgurl = None
+            img = None
+            for imgurl_pattern, img_pattern in img_url_patterns:
+                imgurl_test = imgurl_pattern.format(imgid=imgid)
+                if requests.head(imgurl_test).status_code == 200:
+                    imgurl = imgurl_test
+                    img = img_pattern.format(imgid=imgid)
+                    break
+            if imgurl is None:
+                print("No downloadable image found for " + realurl)
+                continue
+
+            infosdict['image']['imgurl'] = imgurl
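+            # Download the image into its own directory and write the metadata
+            # next to it as <image id>.json.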
+            imgdirectory = os.path.join(directory, imgid)
+            if not os.path.exists(imgdirectory):
+                os.mkdir(imgdirectory)
+            imgdirection = os.path.join(imgdirectory, img)
+            urllib.request.urlretrieve(imgurl, imgdirection)
+            jsonfname = imgid + ".json"
+            jsondirection = os.path.join(imgdirectory, jsonfname)
+            with open(jsondirection, "w") as json_img_file:
+                json_img_file.write(json.dumps(infosdict, sort_keys=True, indent=4, separators=(',', ': ')))
+            print(realurl + " : OK")
+        i += 1
+        eso = imgloturl + str(i)
+    json_all.close()
+
+
+if __name__ == '__main__':
+    main(sys.argv[1:])