--- a/.hgignore Thu Jun 28 15:15:39 2018 +0200
+++ b/.hgignore Thu Jun 28 15:19:43 2018 +0200
@@ -43,3 +43,8 @@
^sbin/sync/.vscode
^sbin/sync/.vagrant
^sbin/sync/fabric.py$
+
+^data/script/.direnv
+^data/script/.envrc$
+^data/script/scrapESO
+
--- a/data/eso_collection.json Thu Jun 28 15:15:39 2018 +0200
+++ b/data/eso_collection.json Thu Jun 28 15:19:43 2018 +0200
@@ -2,5 +2,5 @@
"name": "eso",
"verbose_name": "European Southern Observatory",
"description": "L’ESO construit et gère les télescopes astronomiques au sol les plus puissants au monde qui permettent d’importantes découvertes scientifiques.",
- "image": "foo.jpg"
+ "image": "eso_logo.jpg"
}
\ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data/script/requirements.txt Thu Jun 28 15:19:43 2018 +0200
@@ -0,0 +1,4 @@
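+# beautifulsoup4 + html5lib parse the ESO archive pages, requests fetches them,
+# and demjson decodes the javascript "var images" array embedded in each page.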
+beautifulsoup4
+requests
+demjson
+html5lib
\ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data/script/scrapeso.py Thu Jun 28 15:19:43 2018 +0200
@@ -0,0 +1,141 @@
+import getopt
+import json
+import os
+import sys
+import urllib.request
+
+import demjson
+import requests
+from bs4 import BeautifulSoup
+
+def main(argv):
+    '''
+    Scrape an ESO image archive listing and download every image with its metadata.
+
+    Pass the image list url with -u/--url
+    (for example https://www.eso.org/public/france/images/archive/category/alma/)
+    and the output directory with -d/--dir.
+    '''
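+    # Example invocations (the url is the example from the docstring, the
+    # directory is the default):
+    #   python scrapeso.py
+    #   python scrapeso.py -d ./scrapESO/ -u https://www.eso.org/public/france/images/archive/category/alma/ -o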
+ directory = './scrapESO/'
+ imgloturl = 'https://www.eso.org/public/france/images/viewall/list/'
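+    # Candidate (image url, file name) patterns: they are tried in order and the
+    # first url that answers a HEAD request with 200 is downloaded.
+    # -o/--original swaps in the full-resolution patterns further below.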
+    img_url_patterns = [
+        ('https://cdn.eso.org/images/publicationjpg/{imgid}.jpg', '{imgid}.jpg'),
+        ('https://cdn.eso.org/images/large/{imgid}.jpg', '{imgid}.jpg'),
+    ]
+ try:
+        opts, args = getopt.getopt(argv, "hd:u:o", ["dir=", "url=", "original"])
+ except getopt.GetoptError:
+        print('usage: scrapeso.py -d <directory> -u <imagegroupurl> [-o]')
+ sys.exit(2)
+ for opt, arg in opts:
+        if opt == '-h':
+            print('''scrapeso.py -d <directory> (default is: "./scrapESO/")
+            -u <imagegroupurl> (default is: "https://www.eso.org/public/france/images/viewall/list/")
+            -o to download the original full-resolution files when available''')
+            sys.exit()
+ elif opt in ("-d", "--dir"):
+ directory = arg
+ elif opt in ("-u", "--url"):
+ imgloturl = arg + "list/"
+ elif opt in ("-o", "--original"):
+ img_url_patterns = [
+ ('https://www.eso.org/public/archives/images/original/{imgid}.tif', '{imgid}.tif'),
+ ('https://www.eso.org/public/archives/images/publicationtiff/{imgid}.tif', '{imgid}.tif'),
+ ('https://cdn.eso.org/images/large/{imgid}.jpg', '{imgid}.jpg')]
+
+    # Paths below are built by plain concatenation, so keep a trailing separator.
+    if not directory.endswith("/"):
+        directory += "/"
+    if not os.path.exists(directory):
+        os.makedirs(directory)
+    jsonfile = directory + "allfiles.json"
+    # Decoded image entries from every page, dumped as one JSON array at the end.
+    all_images = []
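+    # The archive list is paginated (.../list/1, .../list/2, ...): keep fetching
+    # pages until the server stops answering with a success status.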
+ i = 1
+ eso = imgloturl + str(i)
+ while requests.head(eso).status_code < 400:
+ page = requests.get(eso)
+ soup = BeautifulSoup(page.text, "html5lib")
+        # The image list is embedded in the page as a javascript literal
+        # ("var images = [...];"): find that script and keep only the array part.
+        scripttext = ""
+        for script in soup.find_all('script'):
+            if "var images" in script.text:
+                scripttext = script.text
+                break
+        if not scripttext:
+            # No image list on this page: stop paginating.
+            break
+        start = scripttext.find("[")
+        listjs = scripttext[start:-2]
+        listdem = demjson.decode(listjs)
+        all_images.extend(listdem)
+ for j in listdem:
+ infosdict = {}
+ infosdict['image'] = {}
+ infosdict['object'] = {}
+ realurl = "https://www.eso.org" + j['url']
+ page = requests.get(realurl)
+ soup = BeautifulSoup(page.text, "html5lib")
+            # The first "object-info" table describes the image, the second the object.
+            info_tables = soup.find("div", class_="object-info").find_all("table")
+            infosimg, infosobj = info_tables[0], info_tables[1]
+ infosdict['url'] = realurl
+ title = soup.find("h1").text
+ infosdict['image']['title'] = title
+ infosdict['image']['credit'] = soup.find("div", class_="credit").text
+ imagesoup = soup.find("div", class_="archive-image archive-image-dark popup")
+ if not imagesoup:
+ print("Image not found for " + realurl)
+ continue
+            print(realurl)
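+            # The description is the run of <p> elements that follows the image
+            # block, up to (but not including) the credit section.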
+ descriptionps = imagesoup.find_all_next("p")
+ descriptiontext = ""
+ descriptionhtml = ""
+ for descriptionp in descriptionps:
+ if "credit" in str(descriptionp.previous_element):
+ break
+ descriptiontext_p = descriptionp.text.strip()
+ if descriptiontext_p:
+ descriptiontext += descriptiontext_p + "\n"
+
+ descriptionhtml_p = str(descriptionp).strip()
+ if descriptionhtml_p != "<p></p>":
+ descriptionhtml += descriptionhtml_p
+ infosdict['image']['description_text'] = descriptiontext
+ infosdict['image']['description_html'] = descriptionhtml
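+            # Extract the image id/type/date and the object name/type from the two
+            # info tables; row labels may be in English or French.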
+            imgid = None
+            for tableimg in infosimg.find_all("tr"):
+ if "Id" in tableimg.text:
+ imgid = tableimg.find_all("td")[1].text
+ infosdict['image']['id'] = imgid
+ if "Type" in tableimg.text:
+ infosdict['image']['type'] = tableimg.find_all("td")[1].text
+ if "Date" in tableimg.text or "date" in tableimg.text:
+ infosdict['image']['date_publication'] = tableimg.find_all("td")[1].text
+ for tableobj in infosobj.find_all("tr"):
+ if "Name" in tableobj.text or "Nom" in tableobj.text:
+ infosdict['object']['name'] = tableobj.find_all("td")[1].text
+ if "Type" in tableobj.text:
+ infosdict['object']['type'] = tableobj.find_all("td")[1].text
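+            # Download the first reachable candidate image and store it, together
+            # with its metadata json, in a per-image sub-directory.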
+            if imgid is None:
+                continue
+            imgurl = None
+            img = None
+ for imgurl_pattern, img_pattern in img_url_patterns:
+ imgurl_test = imgurl_pattern.format(imgid=imgid)
+ if requests.head(imgurl_test).status_code == 200:
+ imgurl = imgurl_test
+ img = img_pattern.format(imgid=imgid)
+ break
+ if imgurl is None:
+ continue
+
+ infosdict['image']['imgurl'] = imgurl
+            imgdirectory = directory + imgid
+            os.makedirs(imgdirectory, exist_ok=True)
+ imgdirection = imgdirectory + "/" + img
+ urllib.request.urlretrieve(imgurl, imgdirection)
+ jsonfname = imgid + ".json"
+ jsondirection = imgdirectory + "/" + jsonfname
+            with open(jsondirection, "w") as json_img_file:
+                json.dump(infosdict, json_img_file, sort_keys=True, indent=4, separators=(',', ': '))
+ print(realurl + " : OK")
+ i += 1
+ eso = imgloturl + str(i)
+    with open(jsonfile, "w") as json_all:
+        json.dump(all_images, json_all, sort_keys=True, indent=4, separators=(',', ': '))
+
+
+if __name__ == '__main__':
+ main(sys.argv[1:])
--- a/src/iconolab_episteme/management/commands/importimages.py Thu Jun 28 15:15:39 2018 +0200
+++ b/src/iconolab_episteme/management/commands/importimages.py Thu Jun 28 15:19:43 2018 +0200
@@ -27,13 +27,6 @@
def add_arguments(self, parser):
parser.add_argument('source_dir')
parser.add_argument(
- '--encoding',
- dest='encoding',
- default='utf-8',
- help='JSON file encoding'
-
- )
- parser.add_argument(
'--collection-id',
dest='collection_id',
default=False,
@@ -45,49 +38,32 @@
default=False,
help='use this option if you only want the image copied and not converted'
)
- parser.add_argument(
- '--folders',
- dest='import_folders',
- default=False,
- action='store_const',
- const=True,
- help='option to create folders'
- )
- # parser.add_argument(
- # '--folders-regexp',
- # dest='folders_regexp',
- # default=False,
- # help='regexp used to extract the folder name/number'
- # )
- # parser.add_argument(
- # '--folders-metadata',
- # dest='folders_metadata',
- # default='REF',
- # help='metadata from which to extract the folder name/number'
- # )
def handle(self, *args, **options):
+        # Remove PIL's image size limit so that very large images can be processed.
+ ImagePIL.MAX_IMAGE_PIXELS = None
+
print('# Logging with logger '+logger.name)
logger.debug('# Initializing command with args: %r', options)
self.source_dir = options.get('source_dir')
- if options.get('collection_id'):
- print('## Finding collection with id ' +
- options.get('collection_id'))
+ collection_id = options.get('collection_id')
+
+ if not collection_id:
+ raise CommandError("No collection id, aborting")
+
+ print('## Finding collection with id %s' % collection_id)
+
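+        # The --collection-id option accepts either a primary key or a collection
+        # name: try the pk lookup first and fall back to a lookup by name.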
+ try:
try:
- collection = Collection.objects.get(
- pk=options.get('collection_id'))
- except Collection.DoesNotExist:
- raise ValueError('!!! Collection with primary key ' +
- options.get('collection_id')+' was not found, aborting !!!')
- else:
- raise ValueError(
- '!!! No collection fixture or collection id, aborting because we can\'t properly generate data. !!!')
-
-
- '''Listing image files in target directory'''
+ collection = Collection.objects.get(pk=int(collection_id))
+ except ValueError:
+ collection = Collection.objects.get(name=collection_id)
+ except Collection.DoesNotExist:
+            raise CommandError('!!! Collection with id ' + collection_id +
+                               ' was not found, aborting !!!')
print(
'## Converting image and moving it to static dir, creating Image and Item objects')
@@ -95,14 +71,18 @@
for dirname, dirs, files in os.walk(self.source_dir):
for filename in files:
+ print("::Examining %s" % filename)
filename_without_extension, extension = os.path.splitext(filename)
if imghdr.what(os.path.join(dirname, filename)) is None:
+ print("-> This is not an image: continue")
continue
json_path = os.path.join(dirname, filename_without_extension + ".json")
if not os.path.isfile(json_path):
+                    print("-> no matching json file: continue")
continue
+                print("-> Processing %s" % json_path)
with open(json_path) as json_data:
eso_data = json.load(json_data)
eso_object = eso_data['object']
@@ -124,7 +104,11 @@
except FileExistsError:
print(image_dir, "directory already exists")
- self.create_item_and_metadata(
- natural_key, collection, eso_data, image_list, options, self.source_dir)
+ try:
+ self.create_item_and_metadata(
+ natural_key, collection, eso_data, image_list, options, self.source_dir)
+ except Exception as e:
+ print("!!! Exception processing %s : %s" % (json_path, e))
+ continue
print('# All done!')
\ No newline at end of file
--- a/src/iconolab_episteme/settings/__init__.py Thu Jun 28 15:15:39 2018 +0200
+++ b/src/iconolab_episteme/settings/__init__.py Thu Jun 28 15:19:43 2018 +0200
@@ -146,7 +146,7 @@
USE_TZ = True
# IMPORT_DEFAULT_FIELD_TO_FILENAME_IDENTIFIER = "INV"
-NO_IMG_CONVERSION_EXTS = [".jpg"]
+NO_IMG_CONVERSION_EXTS = [".jpg", ".jpeg"]
IMG_CONVERSION_EXTS = [".tif", ".tiff"]
IMG_JPG_DEFAULT_QUALITY = 80
PREGENERATE_THUMBNAILS_SIZES = [