* add scraping script
authorymh <ymh.work@gmail.com>
Thu, 28 Jun 2018 15:19:43 +0200
changeset 3 16fb4f5efa69
parent 2 d3fe1866eb5b
child 4 1ffa3a81bdd9
* add scraping script * correct size management problem in importimages
.hgignore
data/eso_collection.json
data/script/requirements.txt
data/script/scrapeso.py
src/iconolab_episteme/management/commands/importimages.py
src/iconolab_episteme/settings/__init__.py
--- a/.hgignore	Thu Jun 28 15:15:39 2018 +0200
+++ b/.hgignore	Thu Jun 28 15:19:43 2018 +0200
@@ -43,3 +43,8 @@
 ^sbin/sync/.vscode
 ^sbin/sync/.vagrant
 ^sbin/sync/fabric.py$
+
+^data/script/.direnv
+^data/script/.envrc$
+^data/script/scrapESO
+
--- a/data/eso_collection.json	Thu Jun 28 15:15:39 2018 +0200
+++ b/data/eso_collection.json	Thu Jun 28 15:19:43 2018 +0200
@@ -2,5 +2,5 @@
 	"name": "eso",
 	"verbose_name": "European Southern Observatory",
 	"description": "L’ESO construit et gère les télescopes astronomiques au sol les plus puissants au monde qui permettent d’importantes découvertes scientifiques.",
-	"image": "foo.jpg"
+	"image": "eso_logo.jpg"
 }
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data/script/requirements.txt	Thu Jun 28 15:19:43 2018 +0200
@@ -0,0 +1,4 @@
+beautifulsoup4
+requests
+demjson
+html5lib
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data/script/scrapeso.py	Thu Jun 28 15:19:43 2018 +0200
@@ -0,0 +1,141 @@
+import json
+import os
+import sys, getopt
+import demjson
+from bs4 import BeautifulSoup
+import urllib.request
+import requests
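+
+# Scrape the ESO public image archive: for each image referenced by the list
+# pages, download the picture and write its metadata to a per-image JSON file,
+# along with a global allfiles.json dump of the raw listing data.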
+
+def main(argv):
+    '''
+    Scrape an ESO image list page. Pass the image list URL with -u
+    (for example https://www.eso.org/public/france/images/archive/category/alma/)
+    and the output directory with -d; -o downloads original/TIFF files when available.
+    '''
+    directory = './scrapESO/'
+    imgloturl = 'https://www.eso.org/public/france/images/viewall/list/'
+    img_url_patterns = [
+        ('https://cdn.eso.org/images/publicationjpg/{imgid}.jpg', '{imgid}.jpg'),
+        ('https://cdn.eso.org/images/large/{imgid}.jpg', '{imgid}.jpg')
+    ]
+    try:
+        opts, args = getopt.getopt(argv, "hd:u:o", ["dir=", "url=", "original"])
+    except getopt.GetoptError:
+        print('scrapeso.py -d <directory> -u <imagegroupurl> [-o]')
+        sys.exit(2)
+    for opt, arg in opts:
+        if opt == '-h':
+            print('''scrapeso.py -d <directory> (default is "./scrapESO/") -o (download originals when available)
+            -u <imagegroupurl> (default is "https://www.eso.org/public/france/images/viewall/list/")''')
+            sys.exit()
+        elif opt in ("-d", "--dir"):
+            directory = arg
+        elif opt in ("-u", "--url"):
+            imgloturl = arg + "list/"
+        elif opt in ("-o", "--original"):
+            img_url_patterns = [
+                ('https://www.eso.org/public/archives/images/original/{imgid}.tif', '{imgid}.tif'),
+                ('https://www.eso.org/public/archives/images/publicationtiff/{imgid}.tif', '{imgid}.tif'),
+                ('https://cdn.eso.org/images/large/{imgid}.jpg', '{imgid}.jpg')]
+
+    if not os.path.exists(directory):
+        os.mkdir(directory)
+    jsonfile = os.path.join(directory, "allfiles.json")
+    json_all = open(jsonfile, "w")
+    i = 1
+    eso = imgloturl + str(i)
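+    # Walk the paginated list pages (.../list/1, .../list/2, ...) until one
+    # responds with an HTTP error status.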
+    while requests.head(eso).status_code < 400:
+        page = requests.get(eso)
+        soup = BeautifulSoup(page.text, "html5lib")
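+        # Each list page embeds its image metadata in a "var images = [...]"
+        # script block; extract the array literal and parse it with demjson.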
+        scripttext = ""
+        for scripts in soup.find_all('script'):
+            if "var images" in scripts.text:
+                scripttext = scripts.text
+                break
+        if not scripttext:
+            break
+        start = scripttext.find("[")
+        listjs = scripttext[start:-2]
+        json_all.write(listjs + ",")
+        listdem = demjson.decode(listjs)
+        for j in listdem:
+            infosdict = {}
+            infosdict['image'] = {}
+            infosdict['object'] = {}
+            realurl = "https://www.eso.org" + j['url']
+            page = requests.get(realurl)
+            soup = BeautifulSoup(page.text, "html5lib")
+            infosimg = soup.find("div", class_="object-info").find_all("table")[0]
+            infosobj = soup.find("div", class_="object-info").find_all("table")[1]
+            infosdict['url'] = realurl
+            title = soup.find("h1").text
+            infosdict['image']['title'] = title
+            infosdict['image']['credit'] = soup.find("div", class_="credit").text
+            imagesoup = soup.find("div", class_="archive-image archive-image-dark popup")
+            if not imagesoup:
+                print("Image not found for " + realurl)
+                continue
+            print(realurl)
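+            # Collect the description paragraphs that follow the image block,
+            # stopping at the credit section; keep plain-text and HTML versions.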
+            descriptionps = imagesoup.find_all_next("p")
+            descriptiontext = ""
+            descriptionhtml = ""
+            for descriptionp in descriptionps:
+                if "credit" in str(descriptionp.previous_element):
+                    break
+                descriptiontext_p = descriptionp.text.strip()
+                if descriptiontext_p:
+                    descriptiontext += descriptiontext_p + "\n"
+
+                descriptionhtml_p = str(descriptionp).strip()
+                if descriptionhtml_p != "<p></p>":
+                    descriptionhtml += descriptionhtml_p
+            infosdict['image']['description_text'] = descriptiontext
+            infosdict['image']['description_html'] = descriptionhtml
+            imgid = None
+            for tableimg in infosimg.find_all("tr"):
+                if "Id" in tableimg.text:
+                    imgid = tableimg.find_all("td")[1].text
+                    infosdict['image']['id'] = imgid
+                if "Type" in tableimg.text:
+                    infosdict['image']['type'] = tableimg.find_all("td")[1].text
+                if "Date" in tableimg.text or "date" in tableimg.text:
+                    infosdict['image']['date_publication'] = tableimg.find_all("td")[1].text
+            if imgid is None:
+                print("Image id not found for " + realurl)
+                continue
+            for tableobj in infosobj.find_all("tr"):
+                if "Name" in tableobj.text or "Nom" in tableobj.text:
+                    infosdict['object']['name'] = tableobj.find_all("td")[1].text
+                if "Type" in tableobj.text:
+                    infosdict['object']['type'] = tableobj.find_all("td")[1].text
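+            # Try each candidate image URL pattern in order and keep the first
+            # one that answers with HTTP 200.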
+            imgurl = None
+            img = None
+            for imgurl_pattern, img_pattern in img_url_patterns:
+                imgurl_test = imgurl_pattern.format(imgid=imgid)
+                if requests.head(imgurl_test).status_code == 200:
+                    imgurl = imgurl_test
+                    img = img_pattern.format(imgid=imgid)
+                    break
+            if imgurl is None:
+                continue
+
+            infosdict['image']['imgurl'] = imgurl
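+            # Download the image into its own directory and write the collected
+            # metadata next to it as <imgid>.json.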
+            imgdirectory = os.path.join(directory, imgid)
+            # Create the per-image directory, ignoring it if it already exists.
+            os.makedirs(imgdirectory, exist_ok=True)
+            imgdirection = os.path.join(imgdirectory, img)
+            urllib.request.urlretrieve(imgurl, imgdirection)
+            jsonfname = imgid + ".json"
+            jsondirection = os.path.join(imgdirectory, jsonfname)
+            json_img_file = open(jsondirection, "w")
+            json_img_file.write(json.dumps(infosdict, sort_keys=True, indent=4, separators=(',', ': ')))
+            json_img_file.close()
+            print(realurl + " : OK")
+        i += 1
+        eso = imgloturl + str(i)
+    json_all.close()
+
+
+if __name__ == '__main__':
+    main(sys.argv[1:])
--- a/src/iconolab_episteme/management/commands/importimages.py	Thu Jun 28 15:15:39 2018 +0200
+++ b/src/iconolab_episteme/management/commands/importimages.py	Thu Jun 28 15:19:43 2018 +0200
@@ -27,13 +27,6 @@
     def add_arguments(self, parser):
         parser.add_argument('source_dir')
         parser.add_argument(
-            '--encoding',
-            dest='encoding',
-            default='utf-8',
-            help='JSON file encoding'
-
-        )
-        parser.add_argument(
             '--collection-id',
             dest='collection_id',
             default=False,
@@ -45,49 +38,32 @@
             default=False,
             help='use this option if you only want the image copied and not converted'
         )
-        parser.add_argument(
-            '--folders',
-            dest='import_folders',
-            default=False,
-            action='store_const',
-            const=True,
-            help='option to create folders'
-        )
-        # parser.add_argument(
-        #     '--folders-regexp',
-        #     dest='folders_regexp',
-        #     default=False,
-        #     help='regexp used to extract the folder name/number'
-        # )
-        # parser.add_argument(
-        #     '--folders-metadata',
-        #     dest='folders_metadata',
-        #     default='REF',
-        #     help='metadata from which to extract the folder name/number'
-        # )
 
     def handle(self, *args, **options):
 
+        # Disable PIL's image size limit so that very large images can be processed.
+        ImagePIL.MAX_IMAGE_PIXELS = None
+
         print('# Logging with logger '+logger.name)
         logger.debug('# Initializing command with args: %r', options)
       
         self.source_dir = options.get('source_dir')
 
-        if options.get('collection_id'):
-            print('## Finding collection with id ' + 
-                    options.get('collection_id'))
+        collection_id = options.get('collection_id')
+
+        if not collection_id:
+            raise CommandError("No collection id, aborting")
+
+        print('## Finding collection with id %s' % collection_id) 
+
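+        # --collection-id accepts either a numeric primary key or a collection name.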
+        try:
             try:
-                collection = Collection.objects.get(
-                    pk=options.get('collection_id'))
-            except Collection.DoesNotExist:
-                raise ValueError('!!! Collection with primary key ' +
-                                    options.get('collection_id')+' was not found, aborting !!!')
-        else:
-            raise ValueError(
-                '!!! No collection fixture or collection id, aborting because we can\'t properly generate data. !!!')
-
-
-        '''Listing image files in target directory'''
+                collection = Collection.objects.get(pk=int(collection_id))
+            except ValueError:
+                collection = Collection.objects.get(name=collection_id)
+        except Collection.DoesNotExist:
+            raise CommandError('!!! Collection with id %s was not found, aborting !!!'
+                               % collection_id)
 
         print(
             '## Converting image and moving it to static dir, creating Image and Item objects')
@@ -95,14 +71,18 @@
 
         for dirname, dirs, files in os.walk(self.source_dir):
             for filename in files:
+                print("::Examining %s" % filename)
                 filename_without_extension, extension = os.path.splitext(filename)
                 if imghdr.what(os.path.join(dirname, filename)) is None:
+                    print("-> This is not an image: continue")
                     continue
 
                 json_path = os.path.join(dirname, filename_without_extension + ".json")
                 if not os.path.isfile(json_path):
+                    print("-> has not a matching json: continue")
                     continue
 
+                print("-> Processing %s" %json_path)
                 with open(json_path) as json_data:
                     eso_data = json.load(json_data)
                     eso_object = eso_data['object']
@@ -124,7 +104,11 @@
                         except FileExistsError:
                             print(image_dir, "directory already exists")
 
-                        self.create_item_and_metadata(
-                            natural_key, collection, eso_data, image_list, options, self.source_dir)
+                        try:
+                            self.create_item_and_metadata(
+                                natural_key, collection, eso_data, image_list, options, self.source_dir)
+                        except Exception as e:
+                            print("!!! Exception processing %s : %s" % (json_path, e))
+                            continue
 
         print('# All done!')
\ No newline at end of file
--- a/src/iconolab_episteme/settings/__init__.py	Thu Jun 28 15:15:39 2018 +0200
+++ b/src/iconolab_episteme/settings/__init__.py	Thu Jun 28 15:19:43 2018 +0200
@@ -146,7 +146,7 @@
 USE_TZ = True
 
 # IMPORT_DEFAULT_FIELD_TO_FILENAME_IDENTIFIER = "INV"
-NO_IMG_CONVERSION_EXTS = [".jpg"]
+NO_IMG_CONVERSION_EXTS = [".jpg", ".jpeg"]
 IMG_CONVERSION_EXTS = [".tif", ".tiff"]
 IMG_JPG_DEFAULT_QUALITY = 80
 PREGENERATE_THUMBNAILS_SIZES = [