src/iconolab_mcc/management/commands/importimages.py
changeset 5 cfd40849d24c
child 7 023dbfdc9f19
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/iconolab_mcc/management/commands/importimages.py	Tue Jun 12 13:11:49 2018 +0200
@@ -0,0 +1,447 @@
+# -*- coding: UTF-8 -*-
+from django.core.management.base import BaseCommand, CommandError
+from django.conf import settings
+from iconolab.models import Collection, Image, ImageStats, Item, ItemMetadata, MetaCategory, Folder
+from PIL import Image as ImagePIL
+from sorl.thumbnail import get_thumbnail
+import os, csv, re, json, shutil, logging
+
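+# Use the dedicated import logger when one is configured in settings, otherwise
+# fall back to this module's logger.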
+if getattr(settings, 'IMPORT_LOGGER_NAME', '') and settings.LOGGING['loggers'].get(settings.IMPORT_LOGGER_NAME, ''):
+    logger = logging.getLogger(settings.IMPORT_LOGGER_NAME)
+else:
+    logger = logging.getLogger(__name__)
+
+class Command(BaseCommand):
+    help = 'Import images from a directory into the media folder and create Item and Image objects'
+
+    def add_arguments(self, parser):
+        parser.add_argument('csv_path')
+        parser.add_argument(
+            '--jpeg-quality',
+            dest='jpeg_quality',
+            type=int,
+            default=settings.IMG_JPG_DEFAULT_QUALITY,
+            help='JPEG quality used when converting images (defaults to settings.IMG_JPG_DEFAULT_QUALITY)'
+        )
+        parser.add_argument(
+            '--encoding',
+            dest='encoding',
+            default='utf-8',
+            help='CSV file encoding'
+        )
+        parser.add_argument(
+            '--collection-json',
+            dest='collection_json',
+            default=False,
+            help='creates a new collection from a json file; the file must contain an object with fields: '+ \
+                 '"name" (identifier), '+ \
+                 '"verbose_name" (display title), '+ \
+                 '"description" (description shown on the homepage, html is supported), '+ \
+                 '"image" (image shown on the homepage, must be "uploads/<imgname>"), '+ \
+                 '"height" and "width" (height and width of the image)',
+        )
+        parser.add_argument(
+            '--collection-id',
+            dest='collection_id',
+            default=False,
+            help='insert extracted data into the specified collection instead of trying to load a collection fixture',
+        )
+        parser.add_argument(
+            '--metacategories-json',
+            dest='metacategories_json',
+            default=False,
+            help='add metacategories to the collection from a json file (the json must be a list of objects with "label" and "triggers_notifications" fields)',
+        )
+        parser.add_argument(
+            '--delimiter',
+            dest='csv_delimiter',
+            default=';',
+            help='csv file delimiter'
+        )
+        parser.add_argument(
+            '--no-jpg-conversion',
+            dest='no-jpg-conversion',
+            default=False,
+            action='store_const',
+            const=True,
+            help='use this option if you only want the images copied, not converted to jpeg'
+        )
+        parser.add_argument(
+            '--img-filename-identifier',
+            dest='img_filename_identifier',
+            default=settings.IMPORT_DEFAULT_FIELD_TO_FILENAME_IDENTIFIER,
+            help='name of the csv field whose value is matched against image filenames to find the image(s) related to a given row'
+        )
+        parser.add_argument(
+            '--filename-regexp-prefix',
+            dest='filename_regexp_prefix',
+            default=r'.*',
+            help='regexp prefix used to match image filenames against csv data. The pattern should describe the part before the filename identifier string, default is .*'
+        )
+        parser.add_argument(
+            '--filename-regexp-suffix',
+            dest='filename_regexp_suffix',
+            default=r'[\.\-_].*',
+            help=r'regexp suffix used to match image filenames against csv data. The pattern should describe the part after the filename identifier string, default is [\.\-_].*'
+        )
+        parser.add_argument(
+            '--folders',
+            dest='import_folders',
+            default=False,
+            action='store_const',
+            const=True,
+            help='create folders from item metadata and attach existing items to them'
+        )
+        parser.add_argument(
+            '--folders-regexp',
+            dest='folders_regexp',
+            default=False,
+            help='regexp used to extract the folder name/number'
+        )
+        parser.add_argument(
+            '--folders-metadata',
+            dest='folders_metadata',
+            default='REF',
+            help='metadata from which to extract the folder name/number'
+        )
+
+    def handle(self, *args, **options):
+        """
+            Step-by-step for import:
+
+            1) Argument checks for file existence and database state to check that everything can proceed without issue before reading the files
+            1) We import data from csv in a 'pivot' list of dicts 'cleaned_row_data' with the following logic:
+                * in the settings, there is value "IMPORT_FIELDS_DICT" that is a dict where each key is an identifier for the metadatas
+                to which we associate a list of column header that will identified as that metadata
+                * The cleaned_row_data list will associate the identifier with the actual value for its related column
+            2) Once we have cleaned_row_data, we filter out rows that don't have any associated image into a 'filtered_row_data' list, and add a key "SRC_IMG_FILES" that contains the list of images associated
+            to each row for the filtered data.
+            3) At this point we have a list of all the items that will be created into the database and the related images to import, so we create the collection object if necessary
+            4) For each item:
+                We create the object in the database
+                * Metadatas are extracted from the filtered_csv_data using the pivot identifiers from settings.IMPORT_FIELD_DICT
+                We copy/convert the image into the MEDIA_ROOT/uploads/ dir: thumbnails size listed in settings.PREGENERATE_THUMBNAIL_SIZES are pre-generated for each image
+
+            Note: each unused row and each unused image in the import folder is kept track of in no_data_images, no_image_rows and duplicate_rows lists and logged at the end of the command.
+        """
+        try:
+            print('# Logging with logger '+logger.name)
+            logger.debug('# Initializing command with args: %r', options)
+            # Check we have a collection to store data into:
+            source_dir = os.path.dirname(os.path.realpath(options.get('csv_path')))
+            print('# Checking collection args')
+            if options.get('collection_json'):
+                print('## Finding collection json data in '+source_dir)
+                collection_json_path = os.path.join(source_dir, options.get('collection_json'))
+                if not os.path.isfile(collection_json_path):
+                    print('### File '+options.get('collection_json')+' was not found in the source directory')
+                    raise ValueError('!!! Json file '+collection_json_path+' was not found !!!')
+                try:
+                    with open(collection_json_path) as json_fixture_file:
+                        collection_data = json.loads(json_fixture_file.read())
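+                        # Expected shape of the collection json (illustrative example only,
+                        # these values are made up):
+                        # {"name": "mcc", "verbose_name": "Collection MCC",
+                        #  "description": "<p>...</p>", "image": "uploads/cover.jpg",
+                        #  "height": 600, "width": 800}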
+                        for key in ['name', 'verbose_name', 'description', 'image', 'height', 'width']:
+                            if key not in collection_data:
+                                print('!!! Json file '+collection_json_path+' has no '+key+' field !!!')
+                                raise ValueError()
+                        if not collection_data.get('name', ''):
+                            print('!!! Collection data key "name" is empty')
+                            raise ValueError()
+                        if Collection.objects.filter(name=collection_data.get('name')).exists():
+                            print('!!! A Collection with the provided name already exists!')
+                            raise ValueError()
+                        if collection_data.get('image', '') and not (collection_data.get('width', 0) and collection_data.get('height', 0)):
+                            print('!!! Collection data has an image but is missing its height or width')
+                            raise ValueError()
+                except ValueError as e:
+                    raise ValueError('!!! JSON Data is invalid. !!!')
+            elif options.get('collection_id'):
+                print('## Finding collection with id '+options.get('collection_id'))
+                try:
+                    collection = Collection.objects.get(pk=options.get('collection_id'))
+                except Collection.DoesNotExist:
+                    raise ValueError('!!! Collection with primary key '+options.get('collection_id')+' was not found, aborting !!!')
+            else:
+                raise ValueError('!!! No collection fixture or collection id, aborting because we can\'t properly generate data. !!!')
+
+            if options.get('metacategories_json'):
+                print('## Finding metacategories fixture json data in '+source_dir)
+                metacategories_json_path = os.path.join(source_dir, options.get('metacategories_json'))
+                if not os.path.isfile(metacategories_json_path):
+                    print('### File '+options.get('metacategories_json')+' was not found in the source directory')
+                    raise ValueError('!!! Fixture file '+metacategories_json_path+' was not found !!!')
+                with open(metacategories_json_path) as metacategories_json_file:
+                    metacategories_data = json.loads(metacategories_json_file.read())
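+                    # Expected shape of the metacategories json (illustrative example only):
+                    # [{"label": "Iconographie", "triggers_notifications": true}]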
+                    for metacategory in metacategories_data:
+                        if metacategory.get('label', None) is None:
+                            raise ValueError('!!! Metacategory without label !!!')
+
+            if options['import_folders'] and not options['folders_regexp']:
+                raise ValueError('!!! No regexp specified to extract folder name !!!')
+
+            # We read the csv
+            delimiter = options.get('csv_delimiter')
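+            # Allow non-printable delimiters to be passed as '#<ascii code>':
+            # 9 is the tab character, 29/30/31 are the group/record/unit separators.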
+            if delimiter == '#9':
+                delimiter = chr(9)
+            if delimiter == '#29':
+                delimiter = chr(29)
+            if delimiter == '#30':
+                delimiter = chr(30)
+            if delimiter == '#31':
+                delimiter = chr(31)
+            csvreader = csv.DictReader(open(options.get('csv_path'), encoding=options.get('encoding')), delimiter=delimiter)
+            print('# Extracting data from csv file and storing it in standardized format')
+            # We store data using the Jocondelab keys, as defined in settings.IMPORT_FIELDS_DICT
+            cleaned_csv_data = []
+            duplicate_rows = []
+            for row in csvreader:
+                cleaned_row_data = {}
+                for key in settings.IMPORT_FIELDS_DICT.keys():
+                    cleaned_row_data[key] = ''
+                    for row_key in row.keys():
+                        if row_key in settings.IMPORT_FIELDS_DICT[key]:
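+                            # REF cells may hold several values separated by ';';
+                            # only the first one is kept as the natural key.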
+                            if key == 'REF':
+                                ref_number, _, _ = row[row_key].partition(';')
+                                cleaned_row_data[key] = ref_number.rstrip()
+                            else:
+                                cleaned_row_data[key] = row[row_key]
+                            break
+                if cleaned_row_data[options.get('img_filename_identifier')] in [row[options.get('img_filename_identifier')] for row in cleaned_csv_data]:
+                    print("## We already have "+options.get('img_filename_identifier')+" value "+cleaned_row_data[options.get('img_filename_identifier')]+" in the data to import, ignoring duplicate line")
+                    duplicate_rows.append(cleaned_row_data)
+                else:
+                    cleaned_csv_data.append(cleaned_row_data)
+            # Listing image files in csv directory
+            image_list = [
+                f for f in os.listdir(source_dir)
+                if os.path.isfile(os.path.join(source_dir, f))
+                and f.lower().endswith(('.jpg', '.tif', '.bmp', '.png'))
+            ] # TODO: maybe detect image files in a more robust way
+            filtered_csv_data = []
+            no_image_rows = []
+            no_data_images = []
+            assigned_images = []
+            # Now we trim the cleaned_csv_data dict to keep only entries that have at least one image
+            for item in cleaned_csv_data:
+                item['SRC_IMG_FILES'] = []
+                has_image = False
+                for image in image_list:
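+                    # Build a pattern of the form <prefix><identifier value><suffix> and
+                    # match it against the filename to link the image to this row.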
+                    img_name_pattern = options.get('filename_regexp_prefix')+re.escape(item[options.get('img_filename_identifier')])+options.get('filename_regexp_suffix')
+                    if re.match(img_name_pattern, image):
+                        item['SRC_IMG_FILES'].append(image)
+                        assigned_images.append(image)
+                        has_image = True
+                if has_image:
+                    filtered_csv_data.append(item)
+                else:
+                    # We keep track of the entries that don't have any corresponding image
+                    no_image_rows.append(item)
+            # We keep track of the images that don't have any corresponding entry
+            for image in image_list:
+                if image not in assigned_images:
+                    no_data_images.append(image)
+
+            print('## found ' + str(len(filtered_csv_data))+' items with at least one image')
+            print('# Importing data into Iconolab')
+            if options.get('collection_json'):
+                print('## Loading collection json')
+                collection = Collection.objects.create(
+                    name = collection_data.get('name'),
+                    verbose_name = collection_data.get('verbose_name', ''),
+                    description = collection_data.get('description', ''),
+                    image = collection_data.get('image', ''),
+                    height = collection_data.get('height', 0),
+                    width = collection_data.get('width', 0),
+                )
+                if collection.image:
+                    collection_image_path = os.path.join(settings.MEDIA_ROOT, str(collection.image))
+                    if not os.path.isfile(collection_image_path):
+                        print('### Moving collection image')
+                        _, collection_image_name = os.path.split(collection_image_path)
+                        try:
+                            col_im = ImagePIL.open(os.path.join(source_dir, collection_image_name))
+                            print('##### Generating or copying jpeg for '+collection_image_name)
+                            col_im.thumbnail(col_im.size)
+                            col_im.save(collection_image_path, 'JPEG', quality=options.get('jpeg_quality', settings.IMG_JPG_DEFAULT_QUALITY))
+                        except Exception as e:
+                            print(e)
+            if options.get('metacategories_json'):
+                for metacategory in metacategories_data:
+                    MetaCategory.objects.create(
+                        collection = collection,
+                        label = metacategory.get('label'),
+                        triggers_notifications = metacategory.get('triggers_notifications', 0)
+                    )
+            print('## Converting image and moving it to static dir, creating Image and Item objects')
+            target_dir = os.path.join(settings.MEDIA_ROOT, 'uploads')
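+            # The uploads/ directory under MEDIA_ROOT is assumed to already exist,
+            # since both shutil.copy and the PIL saves below write directly into it.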
+            print('### Images will be stored in '+target_dir)
+            for item in filtered_csv_data:
+                print('#### Computing metadata for item '+item['REF']+' (natural key)')
+                if not item['REF']:
+                    print('#### No Natural key, skipping')
+                    continue
+                item_authors = item['AUTR']
+                item_school = item['ECOLE']
+                item_designation = ''
+                if item.get('TITR', ''):
+                    item_designation = item['TITR']
+                elif item.get('DENO', ''):
+                    item_designation = item['DENO']
+                elif item.get('APPL', ''):
+                    item_designation = item['APPL']
+                item_datation = ''
+                if item.get('PERI', ''):
+                    item_datation = item['PERI']
+                elif item.get('MILL', ''):
+                    item_datation = item['MILL']
+                elif item.get('EPOQ', ''):
+                    item_datation = item['EPOQ']
+                item_technics = item['TECH']
+                item_field = item['DOM']
+                item_measurements = item['DIMS']
+                item_create_or_usage_location = item['LIEUX']
+                item_discovery_context = item['DECV']
+                item_conservation_location = item['LOCA']
+                item_photo_credits = item['PHOT']
+                item_inventory_number = item['INV']
+                item_joconde_ref = item['REF']
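+                # If an item with this natural key already exists in the collection we only
+                # (optionally) attach it to a folder; its metadata and images are not re-imported.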
+                if ItemMetadata.objects.filter(item__collection = collection, natural_key = item_joconde_ref).exists():
+                    print('#### An item with natural key '+item['REF']+' already exists in the import collection in the database')
+
+                    if options['import_folders']:
+
+                        # Extract folder name from natural key
+                        m = re.search(options['folders_regexp'], item[options['folders_metadata']])
+                        if not m:
+                            print('#### Could not extract a folder id from "'+item[options['folders_metadata']]+'", skipping folder assignment')
+                            continue
+                        folder_id = m.group(1)
+
+                        if not Folder.objects.filter(original_id=folder_id).exists():
+                            print('#### Creating folder "'+folder_id+'"')
+                            folder = Folder.objects.create(
+                                collection = collection,
+                                name = 'Dossier '+folder_id,
+                                original_id = folder_id
+                            )
+                        else:
+                            print('#### Folder "'+folder_id+'" already exists')
+                            folder = Folder.objects.get(original_id=folder_id)
+
+                        item_metadata = ItemMetadata.objects.get(item__collection = collection, natural_key = item_joconde_ref)
+                        item = item_metadata.item
+
+                        item.folders.add(folder)
+
+                else:
+                    print('#### Creating item '+item['REF']+' (natural key) in database')
+                    item_object = Item.objects.create(
+                        collection = collection
+                    )
+
+                    new_metadata = {
+                        "authors" : item_authors,
+                        "school" : item_school,
+                        "designation" : item_designation,
+                        "field" : item_field,
+                        "datation" : item_datation,
+                        "technics" : item_technics,
+                        "measurements" : item_measurements,
+                        "create_or_usage_location" : item_create_or_usage_location,
+                        "discovery_context" : item_discovery_context,
+                        "conservation_location" : item_conservation_location,
+                        "photo_credits" : item_photo_credits,
+                        "inventory_number" : item_inventory_number,
+                        "joconde_ref" : item_joconde_ref
+                    }
+                    ItemMetadata.objects.create(
+                        item = item_object,
+                        metadata = json.dumps(new_metadata),
+                        natural_key = item_joconde_ref
+                    )
+
+                    print('#### Computing item image(s)')
+                    for image in item['SRC_IMG_FILES']:
+                        (image_name, ext) = os.path.splitext(image)
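+                        # Either copy the file as-is (when conversion is disabled or the extension
+                        # is listed in settings.NO_IMG_CONVERSION_EXTS), or re-encode it as JPEG.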
+                        if options.get('no-jpg-conversion') or ext in settings.NO_IMG_CONVERSION_EXTS:
+                            print('##### Copying file '+str(image)+' without converting')
+                            image_path = os.path.join(target_dir, image)
+                            new_image_name = image
+                            shutil.copy(os.path.join(source_dir, image), target_dir)
+                            try:
+                                im = ImagePIL.open(os.path.join(target_dir, image))
+                                im_width, im_height = im.size
+                            except Exception as e:
+                                print(e)
+                                continue
+                        else:
+                            image_path = os.path.join(target_dir, image_name) + '.jpg'
+                            new_image_name = image_name+'.jpg'
+                            if os.path.isfile(image_path):
+                                print('##### A jpeg file already exists in target dir for '+ image)
+                                try:
+                                    im = ImagePIL.open(image_path)
+                                    im_width, im_height = im.size
+                                except Exception as e:
+                                    print(e)
+                                    continue
+                            else:
+                                jpeg_img_path = image_path
+                                try:
+                                    im = ImagePIL.open(os.path.join(source_dir, image))
+                                    print('##### Generating or copying jpeg for '+image)
+                                    im.thumbnail(im.size)
+                                    im.save(jpeg_img_path, 'JPEG', quality=options.get('jpeg_quality', settings.IMG_JPG_DEFAULT_QUALITY))
+                                    im_width, im_height = im.size
+                                except Exception as e:
+                                    print(e)
+                                    continue
+                        new_image = Image.objects.create(
+                            item = item_object,
+                            media = 'uploads/'+new_image_name,
+                            name = new_image_name,
+                            height = im_height,
+                            width = im_width
+                        )
+                        ImageStats.objects.create(
+                            image = new_image
+                        )
+                    print('### Generating thumbnails for item '+item['REF'])
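+                    # settings.PREGENERATE_THUMBNAILS_SIZES is expected to contain sorl
+                    # geometry strings (e.g. '100x100') so thumbnails are warmed up at import time.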
+                    for image in item_object.images.all():
+                        for size in settings.PREGENERATE_THUMBNAILS_SIZES:
+                            print('#### Thumbnail for size '+size)
+                            get_thumbnail(image.media, size, crop=False)
+
+            print('# All done!')
+
+            logger.debug('# Recap for import command: ')
+            print('# Images without data: ')
+            logger.debug('## Checking images left without data')
+            collection_image_file = os.path.split(str(collection.image))[1]
+            if no_data_images and collection_image_file in no_data_images:
+                no_data_images.remove(collection_image_file)
+
+            if no_data_images:
+                for image in no_data_images:
+                    logger.debug('### %r', image)
+                    print('## '+image)
+            else:
+                print('## Each image has one corresponding row!')
+                logger.debug('### Each image has one corresponding row!')
+            print('# CSV Items without image')
+            logger.debug('## Checking csv rows left without image')
+            if no_image_rows:
+                for item in no_image_rows:
+                    logger.debug('### %r', item['REF'])
+                    print('## Natural key: '+item['REF'])
+            else:
+                print('## Each row found at least one corresponding image!')
+                logger.debug('### Each row found at least one corresponding image!')
+            print('# Duplicate rows in csv')
+            logger.debug('## Checking duplicate rows in csv')
+            if duplicate_rows:
+                for item in duplicate_rows:
+                    logger.debug('### %r: %r', options.get('img_filename_identifier'), item[options.get('img_filename_identifier')])
+                    print('## '+options.get('img_filename_identifier')+': '+item[options.get('img_filename_identifier')])
+            else:
+                print('## No duplicate rows were found in the csv!')
+                logger.debug('### No duplicate rows were found in the csv!')
+        except FileNotFoundError:
+            print('!!! File '+options.get('csv_path')+' does not exist. !!!')
+        except ValueError as e:
+            print(str(e))