# HG changeset patch
# User ymh
# Date 1530191983 -7200
# Node ID 16fb4f5efa697cad031c3641b26f8cb43c2c9e87
# Parent  d3fe1866eb5b29144d8b923c59442ba7b3cef9b7
* add scraping script
* correct size management problem on importimages

diff -r d3fe1866eb5b -r 16fb4f5efa69 .hgignore
--- a/.hgignore	Thu Jun 28 15:15:39 2018 +0200
+++ b/.hgignore	Thu Jun 28 15:19:43 2018 +0200
@@ -43,3 +43,8 @@
 ^sbin/sync/.vscode
 ^sbin/sync/.vagrant
 ^sbin/sync/fabric.py$
+
+^data/script/.direnv
+^data/script/.envrc$
+^data/script/scrapESO
+
diff -r d3fe1866eb5b -r 16fb4f5efa69 data/eso_collection.json
--- a/data/eso_collection.json	Thu Jun 28 15:15:39 2018 +0200
+++ b/data/eso_collection.json	Thu Jun 28 15:19:43 2018 +0200
@@ -2,5 +2,5 @@
     "name": "eso",
     "verbose_name": "European Southern Observatory",
     "description": "L’ESO construit et gère les télescopes astronomiques au sol les plus puissants au monde qui permettent d’importantes découvertes scientifiques.",
-    "image": "foo.jpg"
+    "image": "eso_logo.jpg"
 }
\ No newline at end of file
diff -r d3fe1866eb5b -r 16fb4f5efa69 data/script/requirements.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data/script/requirements.txt	Thu Jun 28 15:19:43 2018 +0200
@@ -0,0 +1,4 @@
+beautifulsoup4
+requests
+demjson
+html5lib
\ No newline at end of file
diff -r d3fe1866eb5b -r 16fb4f5efa69 data/script/scrapeso.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/data/script/scrapeso.py	Thu Jun 28 15:19:43 2018 +0200
@@ -0,0 +1,141 @@
+import json
+import os
+import sys, getopt
+import demjson
+from bs4 import BeautifulSoup
+import urllib.request
+import requests
+
+
+def main(argv):
+    '''
+    Scrape image metadata and files from the ESO public image archive.
+    Pass an image lot url with -u
+    (for example https://www.eso.org/public/france/images/archive/category/alma/)
+    and a target directory with -d.
+    '''
+    directory = './scrapESO/'
+    imgloturl = 'https://www.eso.org/public/france/images/viewall/list/'
+    # (image url pattern, local filename pattern) pairs, tried in order.
+    img_url_patterns = [
+        ('https://cdn.eso.org/images/publicationjpg/{imgid}.jpg', '{imgid}.jpg'),
+        ('https://cdn.eso.org/images/large/{imgid}.jpg', '{imgid}.jpg')
+    ]
+    try:
+        opts, args = getopt.getopt(argv, "hd:u:o", ["dir=", "url=", "original"])
+    except getopt.GetoptError:
+        print('scrapeso.py -d <dir> -u <url> [-o]')
+        sys.exit(2)
+    for opt, arg in opts:
+        if opt == '-h':
+            print('''scrapeso.py -d <dir> (default is: "./scrapESO/")
+ -u <url> (default is: "https://www.eso.org/public/france/images/viewall/list/")
+ -o to prefer original tif files when available''')
+            sys.exit()
+        elif opt in ("-d", "--dir"):
+            directory = arg
+        elif opt in ("-u", "--url"):
+            imgloturl = arg + "list/"
+        elif opt in ("-o", "--original"):
+            img_url_patterns = [
+                ('https://www.eso.org/public/archives/images/original/{imgid}.tif', '{imgid}.tif'),
+                ('https://www.eso.org/public/archives/images/publicationtiff/{imgid}.tif', '{imgid}.tif'),
+                ('https://cdn.eso.org/images/large/{imgid}.jpg', '{imgid}.jpg')]
+
+    if not os.path.exists(directory):
+        os.mkdir(directory)
+    jsonfile = directory + "allfiles.json"
+    json_all = open(jsonfile, "w")
+    i = 1
+    eso = imgloturl + str(i)
+    # Walk the paginated list pages until the server answers with an error.
+    while requests.head(eso).status_code < 400:
+        page = requests.get(eso)
+        soup = BeautifulSoup(page.text, "html5lib")
+        # The image list is embedded in the page as a "var images = [...]" script.
+        start = 0
+        for scripts in soup.find_all('script'):
+            if "var images" in scripts.text:
+                scripttext = scripts.text
+                break
+        for scriptchar in scripttext:
+            if scriptchar == "[":
+                break
+            start += 1
+        listjs = scripttext[start:-2]
+        json_all.write(listjs + ",")
+        # The embedded array is javascript, not strict JSON, hence demjson.
+        listdem = demjson.decode(listjs)
+        for j in listdem:
+            infosdict = {}
+            infosdict['image'] = {}
+            infosdict['object'] = {}
+            realurl = "https://www.eso.org" + j['url']
+            page = requests.get(realurl)
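+            # Each detail page has an "object-info" div with two tables:
+            # the first describes the image, the second the astronomical object.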
+            #print(realurl)
+            soup = BeautifulSoup(page.text, "html5lib")
+            infosimg = soup.find("div", class_="object-info").find_all("table")[0]
+            infosobj = soup.find("div", class_="object-info").find_all("table")[1]
+            infosdict['url'] = realurl
+            #print(realurl)
+            title = soup.find("h1").text
+            infosdict['image']['title'] = title
+            infosdict['image']['credit'] = soup.find("div", class_="credit").text
+            imagesoup = soup.find("div", class_="archive-image archive-image-dark popup")
+            if not imagesoup:
+                print("Image not found for " + realurl)
+                continue
+            print(realurl)
+            # Collect the description paragraphs that follow the image,
+            # stopping at the credit block.
+            descriptionps = imagesoup.find_all_next("p")
+            descriptiontext = ""
+            descriptionhtml = ""
+            for descriptionp in descriptionps:
+                if "credit" in str(descriptionp.previous_element):
+                    break
+                descriptiontext_p = descriptionp.text.strip()
+                if descriptiontext_p:
+                    descriptiontext += descriptiontext_p + "\n"
+
+                descriptionhtml_p = str(descriptionp).strip()
+                # skip paragraphs that only contain a non-breaking space
+                if descriptionhtml_p != "<p>\xa0</p>":
+                    descriptionhtml += descriptionhtml_p
+            infosdict['image']['description_text'] = descriptiontext
+            infosdict['image']['description_html'] = descriptionhtml
+            for tableimg in infosimg.find_all("tr"):
+                if "Id" in tableimg.text:
+                    imgid = tableimg.find_all("td")[1].text
+                    infosdict['image']['id'] = imgid
+                if "Type" in tableimg.text:
+                    infosdict['image']['type'] = tableimg.find_all("td")[1].text
+                if "Date" in tableimg.text or "date" in tableimg.text:
+                    infosdict['image']['date_publication'] = tableimg.find_all("td")[1].text
+            for tableobj in infosobj.find_all("tr"):
+                if "Name" in tableobj.text or "Nom" in tableobj.text:
+                    infosdict['object']['name'] = tableobj.find_all("td")[1].text
+                if "Type" in tableobj.text:
+                    infosdict['object']['type'] = tableobj.find_all("td")[1].text
+            # Try each candidate url for this image id; keep the first that exists.
+            imgurl = None
+            img = None
+            for imgurl_pattern, img_pattern in img_url_patterns:
+                imgurl_test = imgurl_pattern.format(imgid=imgid)
+                if requests.head(imgurl_test).status_code == 200:
+                    imgurl = imgurl_test
+                    img = img_pattern.format(imgid=imgid)
+                    break
+            if imgurl is None:
+                continue
+
+            infosdict['image']['imgurl'] = imgurl
+            imgdirectory = directory + imgid
+            if not os.path.exists(imgdirectory):
+                os.mkdir(imgdirectory)
+            imgdirection = imgdirectory + "/" + img
+            urllib.request.urlretrieve(imgurl, imgdirection)
+            jsonfname = imgid + ".json"
+            jsondirection = imgdirectory + "/" + jsonfname
+            json_img_file = open(jsondirection, "w")
+            json_img_file.write(json.dumps(infosdict, sort_keys=True, indent=4, separators=(',', ': ')))
+            json_img_file.close()
+            print(realurl + " : OK")
+        i += 1
+        eso = imgloturl + str(i)
+    json_all.close()
+
+
+if __name__ == '__main__':
+    main(sys.argv[1:])
diff -r d3fe1866eb5b -r 16fb4f5efa69 src/iconolab_episteme/management/commands/importimages.py
--- a/src/iconolab_episteme/management/commands/importimages.py	Thu Jun 28 15:15:39 2018 +0200
+++ b/src/iconolab_episteme/management/commands/importimages.py	Thu Jun 28 15:19:43 2018 +0200
@@ -27,13 +27,6 @@
     def add_arguments(self, parser):
         parser.add_argument('source_dir')
         parser.add_argument(
-            '--encoding',
-            dest='encoding',
-            default='utf-8',
-            help='JSON file encoding'
-
-        )
-        parser.add_argument(
             '--collection-id',
             dest='collection_id',
             default=False,
@@ -45,49 +38,32 @@
             default=False,
             help='use this option if you only want the image copied and not converted'
         )
-        parser.add_argument(
-            '--folders',
-            dest='import_folders',
-            default=False,
-            action='store_const',
-            const=True,
-            help='option to create folders'
-        )
-        # parser.add_argument(
-        #     '--folders-regexp',
-        #     dest='folders_regexp',
-        #     default=False,
-        #     help='regexp used to extract the folder name/number'
-        # )
-        # parser.add_argument(
-        #     '--folders-metadata',
-        #     dest='folders_metadata',
-        #     default='REF',
-        #     help='metadata from which to extract the folder name/number'
-        # )

     def handle(self, *args, **options):
+        # Set no image size limit on PIL, to be able to process big images.
+        ImagePIL.MAX_IMAGE_PIXELS = None
+
         print('# Logging with logger '+logger.name)
         logger.debug('# Initializing command with args: %r', options)
         self.source_dir = options.get('source_dir')
-        if options.get('collection_id'):
-            print('## Finding collection with id ' +
-                  options.get('collection_id'))
+        collection_id = options.get('collection_id')
+
+        if not collection_id:
+            raise CommandError("No collection id, aborting")
+
+        print('## Finding collection with id %s' % collection_id)
+
+        # Accept either a primary key or a collection name.
+        try:
             try:
-                collection = Collection.objects.get(
-                    pk=options.get('collection_id'))
-            except Collection.DoesNotExist:
-                raise ValueError('!!! Collection with primary key ' +
-                                 options.get('collection_id')+' was not found, aborting !!!')
-        else:
-            raise ValueError(
-                '!!! No collection fixture or collection id, aborting because we can\'t properly generate data. !!!')
-
-
-        '''Listing image files in target directory'''
+                collection = Collection.objects.get(pk=int(collection_id))
+            except ValueError:
+                collection = Collection.objects.get(name=collection_id)
+        except Collection.DoesNotExist:
+            raise CommandError('!!! Collection with id ' + collection_id +
+                               ' was not found, aborting !!!')

         print(
             '## Converting image and moving it to static dir, creating Image and Item objects')
@@ -95,14 +71,18 @@

         for dirname, dirs, files in os.walk(self.source_dir):
             for filename in files:
+                print("::Examining %s" % filename)
                 filename_without_extension, extension = os.path.splitext(filename)
                 if imghdr.what(os.path.join(dirname, filename)) is None:
+                    print("-> This is not an image: continue")
                     continue
                 json_path = os.path.join(dirname, filename_without_extension + ".json")
                 if not os.path.isfile(json_path):
+                    print("-> no matching json: continue")
                     continue
+                print("-> Processing %s" % json_path)
                 with open(json_path) as json_data:
                     eso_data = json.load(json_data)
                     eso_object = eso_data['object']
@@ -124,7 +104,11 @@
                 except FileExistsError:
                     print(image_dir, "directory already exists")

-                self.create_item_and_metadata(
-                    natural_key, collection, eso_data, image_list, options, self.source_dir)
+                try:
+                    self.create_item_and_metadata(
+                        natural_key, collection, eso_data, image_list, options, self.source_dir)
+                except Exception as e:
+                    print("!!! Exception processing %s : %s" % (json_path, e))
+                    continue

         print('# All done!')
\ No newline at end of file
diff -r d3fe1866eb5b -r 16fb4f5efa69 src/iconolab_episteme/settings/__init__.py
--- a/src/iconolab_episteme/settings/__init__.py	Thu Jun 28 15:15:39 2018 +0200
+++ b/src/iconolab_episteme/settings/__init__.py	Thu Jun 28 15:19:43 2018 +0200
@@ -146,7 +146,7 @@
 USE_TZ = True

 # IMPORT_DEFAULT_FIELD_TO_FILENAME_IDENTIFIER = "INV"
-NO_IMG_CONVERSION_EXTS = [".jpg"]
+NO_IMG_CONVERSION_EXTS = [".jpg", ".jpeg"]
 IMG_CONVERSION_EXTS = [".tif", ".tiff"]
 IMG_JPG_DEFAULT_QUALITY = 80
 PREGENERATE_THUMBNAILS_SIZES = [