src/iconolab_mcc/management/commands/importimages.py
author Riwad Salim
Tue, 12 Jun 2018 13:11:49 +0200
changeset 5 cfd40849d24c
child 7 023dbfdc9f19
permissions -rw-r--r--
Turning iconolab-mcc into App to add specific import commands

# -*- coding: UTF-8 -*-
from django.core.management.base import BaseCommand, CommandError
from django.core.management import call_command
from django.conf import settings
from iconolab.models import Collection, Image, ImageStats, Item, ItemMetadata, MetaCategory, Folder
from PIL import Image as ImagePIL
from sorl.thumbnail import get_thumbnail
import os, csv, pprint, re, json, shutil, logging

# Pick the logger used for the import recap.
# The dedicated import logger is used only when it is both named in the
# settings and configured under LOGGING['loggers']; in every other case
# (setting missing, LOGGING without a 'loggers' section, logger not
# configured) we fall back to this module's logger instead of failing at
# import time.
_import_logger_name = getattr(settings, 'IMPORT_LOGGER_NAME', None)
_configured_loggers = (getattr(settings, 'LOGGING', None) or {}).get('loggers') or {}
if _import_logger_name and _configured_loggers.get(_import_logger_name, ''):
    logger = logging.getLogger(_import_logger_name)
else:
    logger = logging.getLogger(__name__)

class Command(BaseCommand):
    """Management command importing csv rows and their images into a Collection.

    Each csv row is mapped to the metadata identifiers declared in
    settings.IMPORT_FIELDS_DICT, matched against the image files found next to
    the csv file, then stored as Item / ItemMetadata / Image / ImageStats
    objects. See handle() for the step-by-step description.
    """
    help = 'import images from a directory into the media folder and creates item and image objects'

    # File extensions (matched case-sensitively) that mark a file of the source
    # directory as an image candidate.
    _IMAGE_EXTENSIONS = ('.jpg', '.tif', '.bmp', '.png')

    # Command line aliases for delimiter characters that are awkward to type
    # in a shell (tab and the ASCII group/record/unit separators).
    _DELIMITER_ALIASES = {
        '#9': chr(9),
        '#29': chr(29),
        '#30': chr(30),
        '#31': chr(31),
    }

    @staticmethod
    def _parse_jpeg_quality(value):
        """Convert a --jpeg-quality value to int when it is numeric.

        Values coming from the command line are strings; they are converted
        to int when possible. Non-numeric values are returned unchanged so
        that anything previously accepted is still passed through as-is.
        """
        try:
            return int(value)
        except (TypeError, ValueError):
            return value

    def add_arguments(self, parser):
        """Declare the command line arguments of the import command."""
        parser.add_argument('csv_path')
        parser.add_argument(
            '--jpeg-quality',
            dest='jpeg_quality',
            default=settings.IMG_JPG_DEFAULT_QUALITY,
            # Without a converter the value given on the command line stays a string.
            type=self._parse_jpeg_quality,
            help='Jpeg default quality'
        )
        parser.add_argument(
            '--encoding',
            dest='encoding',
            default='utf-8',
            help='CSV file encoding'
        )
        parser.add_argument(
            '--collection-json',
            dest='collection_json',
            default=False,
            help=(
                'creates a new collection from a json file, must be an object with fields : '
                '"name" (identifier), '
                '"verbose_name" (proper title name), '
                '"description" (description on homepage, html is supported), '
                '"image" (image on homepages, must be "uploads/<imgname>"), '
                '"height" and "width" (height and width of the image)'
            ),
        )
        parser.add_argument(
            '--collection-id',
            dest='collection_id',
            default=False,
            help='insert extracted data into the specified collection instead of trying to load a collection fixture',
        )
        parser.add_argument(
            '--metacategories-json',
            dest='metacategories_json',
            default=False,
            help='add metacategories to the collection from a json file (json must be a list of object with "label" and "triggers_notifications" fields)',
        )
        parser.add_argument(
            '--delimiter',
            dest='csv_delimiter',
            default=';',
            help='csv file delimiter'
        )
        parser.add_argument(
            '--no-jpg-conversion',
            dest='no-jpg-conversion',
            default=False,
            # nargs='?' keeps the historical "--no-jpg-conversion <value>" form
            # working while also allowing the option to be used as a bare flag.
            nargs='?',
            const=True,
            help='use this option if you only want the image copied and not converted'
        )
        parser.add_argument(
            '--img-filename-identifier',
            dest='img_filename_identifier',
            default=settings.IMPORT_DEFAULT_FIELD_TO_FILENAME_IDENTIFIER,
            help='codename of the csv field we\'ll try to match to find the related image to a given object'
        )
        parser.add_argument(
            '--filename-regexp-prefix',
            dest='filename_regexp_prefix',
            default=r'.*',
            help='regexp prefix to properly parse image names with info from csv. The pattern should describe the part before the filename identifier string, default is .*'
        )
        parser.add_argument(
            '--filename-regexp-suffix',
            dest='filename_regexp_suffix',
            default=r'[\.\-_].*',
            # Raw string: same text as before, without invalid escape sequences.
            help=r'regexp suffix to properly parse image names with info from csv. The pattern should describe the part after the filename identifier string, default is [\.\-_].*'
        )
        parser.add_argument(
            '--folders',
            dest='import_folders',
            default=False,
            action='store_const',
            const=True,
            help='option to create folders'
        )
        parser.add_argument(
            '--folders-regexp',
            dest='folders_regexp',
            default=False,
            help='regexp used to extract the folder name/number'
        )
        parser.add_argument(
            '--folders-metadata',
            dest='folders_metadata',
            default='REF',
            help='metadata from which to extract the folder name/number'
        )

    def handle(self, *args, **options):
        """
            Step-by-step for import:

            1) Argument checks for file existence and database state to check that everything can proceed without issue before reading the files
            2) We import data from csv in a 'pivot' list of dicts 'cleaned_csv_data' with the following logic:
                * in the settings, there is a value "IMPORT_FIELDS_DICT" that is a dict where each key is an identifier for the metadatas
                to which we associate a list of column headers that will be identified as that metadata
                * Each entry of cleaned_csv_data associates the identifier with the actual value of its related column
            3) Once we have cleaned_csv_data, we filter out rows that don't have any associated image into a 'filtered_csv_data' list, and add a key "SRC_IMG_FILES" that contains the list of images associated
            to each row for the filtered data.
            4) At this point we have a list of all the items that will be created into the database and the related images to import, so we create the collection object if necessary
            5) For each item:
                We create the object in the database
                * Metadatas are extracted from the filtered_csv_data using the pivot identifiers from settings.IMPORT_FIELDS_DICT
                We copy/convert the image into the MEDIA_ROOT/uploads/ dir: thumbnails size listed in settings.PREGENERATE_THUMBNAILS_SIZES are pre-generated for each image

            Note: each unused row and each unused image in the import folder is kept track of in no_data_images, no_image_rows and duplicate_rows lists and logged at the end of the command.

            ValueError and FileNotFoundError raised during the import are caught here and printed; nothing is re-raised for them.
        """
        try:
            print('# Logging with logger '+logger.name)
            logger.debug('# Initializing command with args: %r', options)
            # Images and json fixtures are looked up next to the csv file.
            source_dir = os.path.dirname(os.path.realpath(options.get('csv_path')))

            # Check we have a collection to store data into:
            print('# Checking collection args')
            collection = None
            collection_data = None
            if options.get('collection_json'):
                collection_data = self._load_collection_json(source_dir, options.get('collection_json'))
            elif options.get('collection_id'):
                collection = self._get_collection(options.get('collection_id'))
            else:
                raise ValueError('!!! No collection fixture or collection id, aborting because we can\'t properly generate data. !!!')

            metacategories_data = []
            if options.get('metacategories_json'):
                metacategories_data = self._load_metacategories_json(source_dir, options.get('metacategories_json'))

            folder_pattern = None
            if options['import_folders']:
                folder_pattern = self._check_folder_options(options)

            cleaned_csv_data, duplicate_rows = self._read_csv(options)
            filtered_csv_data, no_image_rows, no_data_images = self._match_images(source_dir, cleaned_csv_data, options)

            print('## found ' + str(len(filtered_csv_data))+' items with at least one image')
            print('# Importing data into Iconolab')
            if options.get('collection_json'):
                collection = self._create_collection(collection_data, source_dir, options)
            for metacategory in metacategories_data:
                MetaCategory.objects.create(
                    collection=collection,
                    label=metacategory.get('label'),
                    triggers_notifications=metacategory.get('triggers_notifications', 0)
                )

            print('## Converting image and moving it to static dir, creating Image and Item objects')
            target_dir = os.path.join(settings.MEDIA_ROOT, 'uploads')
            print('### Images will be stored in '+target_dir)
            # shutil.copy() towards a missing directory would create a *file*
            # named "uploads", so make sure the directory exists first.
            os.makedirs(target_dir, exist_ok=True)
            for item in filtered_csv_data:
                self._import_item(item, collection, source_dir, target_dir, folder_pattern, options)

            print('# All done!')
            self._print_recap(collection, no_data_images, no_image_rows, duplicate_rows, options.get('img_filename_identifier'))
        except FileNotFoundError as e:
            # Report the file that is actually missing; for a missing csv file
            # this is the csv path exactly as it was given.
            print('!!! File '+str(e.filename or options.get('csv_path'))+' does not exist. !!!')
        except ValueError as e:
            print(str(e))

    def _load_collection_json(self, source_dir, json_name):
        """Read and validate the collection json file, returning its content.

        Raises ValueError when the file is missing, is not valid json, lacks a
        required field, has an empty name, names an existing collection, or
        has an image without height and width.
        """
        print('## Finding collection json data in '+source_dir)
        collection_json_path = os.path.join(source_dir, json_name)
        if not os.path.isfile(collection_json_path):
            print('### No '+json_name+'.json file was found in the source directory')
            raise ValueError('!!! Json file '+collection_json_path+' was not found !!!')
        try:
            with open(collection_json_path) as json_fixture_file:
                collection_data = json.loads(json_fixture_file.read())
            for key in ['name', 'verbose_name', 'description', 'image', 'height', 'width']:
                if key not in collection_data:
                    print('!!! Json file '+collection_json_path+' has no '+key+' field !!!')
                    raise ValueError()
            if not collection_data.get('name', ''):
                print('!!! Collection data key "name" is empty')
                raise ValueError()
            if Collection.objects.filter(name=collection_data.get('name')).exists():
                print('!!! A Collection with the provided name already exists!')
                raise ValueError()
            if collection_data.get('image', '') and not (collection_data.get('width', 0) and collection_data.get('height', 0)):
                print('!!! Collection data has an image but no height and width')
                raise ValueError()
        except ValueError as e:
            # Details were printed above (or are carried by the chained
            # json decoding error); callers only see the generic message.
            raise ValueError('!!! JSON Data is invalid. !!!') from e
        return collection_data

    def _get_collection(self, collection_id):
        """Return the existing Collection with this primary key or raise ValueError."""
        print('## Finding collection with id '+str(collection_id))
        try:
            return Collection.objects.get(pk=collection_id)
        except Collection.DoesNotExist:
            raise ValueError('!!! Collection with primary key '+str(collection_id)+' was not found, aborting !!!')

    def _load_metacategories_json(self, source_dir, json_name):
        """Read the metacategories json file and check every entry has a label."""
        print('## Finding metacategories fixture json data in '+source_dir)
        metacategories_json_path = os.path.join(source_dir, json_name)
        if not os.path.isfile(metacategories_json_path):
            print('### No '+json_name+'.json file was found in the source directory')
            raise ValueError('!!! Fixture file '+metacategories_json_path+' was not found !!!')
        with open(metacategories_json_path) as metacategories_json_file:
            metacategories_data = json.loads(metacategories_json_file.read())
        for metacategory in metacategories_data:
            if metacategory.get('label', None) is None:
                raise ValueError('!!! Metacategory without label !!!')
        return metacategories_data

    def _check_folder_options(self, options):
        """Validate the folder related options and return the compiled folder regexp.

        Raises ValueError when no regexp was given, when it does not compile,
        when it has no capturing group (the folder name is read from group 1)
        or when the metadata to extract the folder from is unknown.
        """
        if not options['folders_regexp']:
            raise ValueError('!!! No regexp specified to extract folder name !!!')
        try:
            folder_pattern = re.compile(options['folders_regexp'])
        except re.error as e:
            raise ValueError('!!! Invalid folders regexp: '+str(e)+' !!!') from e
        if folder_pattern.groups < 1:
            raise ValueError('!!! Folders regexp must contain a capturing group for the folder name !!!')
        if options['folders_metadata'] not in settings.IMPORT_FIELDS_DICT:
            raise ValueError('!!! Unknown folders metadata '+str(options['folders_metadata'])+', it must be a key of settings.IMPORT_FIELDS_DICT !!!')
        return folder_pattern

    def _read_csv(self, options):
        """Read the csv file into the pivot format keyed by settings.IMPORT_FIELDS_DICT.

        Returns a (cleaned_csv_data, duplicate_rows) tuple: rows whose image
        filename identifier was already seen go to duplicate_rows.
        """
        identifier_key = options.get('img_filename_identifier')
        if identifier_key not in settings.IMPORT_FIELDS_DICT:
            raise ValueError('!!! Unknown image filename identifier '+str(identifier_key)+', it must be a key of settings.IMPORT_FIELDS_DICT !!!')

        delimiter = options.get('csv_delimiter')
        delimiter = self._DELIMITER_ALIASES.get(delimiter, delimiter)

        cleaned_csv_data = []
        duplicate_rows = []
        seen_identifiers = set()
        # newline='' is what the csv module expects from the file object it
        # reads; the context manager closes the file once parsing is done.
        with open(options.get('csv_path'), encoding=options.get('encoding'), newline='') as csv_file:
            csvreader = csv.DictReader(csv_file, delimiter=delimiter)
            print('# Extracting data from csv file and storing it in standardized format')
            # We store data using the Jocondelab keys, as defined in settings.IMPORT_FIELDS_DICT
            for row in csvreader:
                cleaned_row_data = {}
                for key, headers in settings.IMPORT_FIELDS_DICT.items():
                    cleaned_row_data[key] = ''
                    for row_key in row.keys():
                        if row_key in headers:
                            # Short rows yield None for their missing columns.
                            value = row[row_key]
                            if value is None:
                                value = ''
                            if key == 'REF':
                                # Only the part before the first ';' is the reference.
                                ref_number, _, _ = value.partition(';')
                                cleaned_row_data[key] = ref_number.rstrip()
                            else:
                                cleaned_row_data[key] = value
                            break
                identifier = cleaned_row_data[identifier_key]
                if identifier in seen_identifiers:
                    print("## We already have "+identifier_key+" value "+identifier+" in the data to import, ignoring duplicate line")
                    duplicate_rows.append(cleaned_row_data)
                else:
                    seen_identifiers.add(identifier)
                    cleaned_csv_data.append(cleaned_row_data)
        return cleaned_csv_data, duplicate_rows

    def _match_images(self, source_dir, cleaned_csv_data, options):
        """Associate each row with the image files whose name matches its identifier.

        Adds a 'SRC_IMG_FILES' list to every row and returns a
        (filtered_csv_data, no_image_rows, no_data_images) tuple: rows with at
        least one image, rows without any, and images matched by no row.
        """
        # Listing image files in csv directory
        image_list = [
            f for f in os.listdir(source_dir)
            if os.path.isfile(os.path.join(source_dir, f))
            and f.endswith(self._IMAGE_EXTENSIONS)
        ] # Maybe check if image another way
        identifier_key = options.get('img_filename_identifier')
        prefix = options.get('filename_regexp_prefix')
        suffix = options.get('filename_regexp_suffix')

        filtered_csv_data = []
        no_image_rows = []
        assigned_images = set()
        for item in cleaned_csv_data:
            # NOTE(review): a row with an empty identifier yields a pattern made
            # of prefix+suffix only, which with the defaults matches every
            # image -- confirm whether such rows should be skipped here.
            img_name_pattern = re.compile(prefix+re.escape(item[identifier_key])+suffix)
            item['SRC_IMG_FILES'] = [image for image in image_list if img_name_pattern.match(image)]
            if item['SRC_IMG_FILES']:
                assigned_images.update(item['SRC_IMG_FILES'])
                filtered_csv_data.append(item)
            else:
                # We keep track of the entries that don't have any corresponding image
                no_image_rows.append(item)
        # We keep track of the images that don't have any corresponding entry
        no_data_images = [image for image in image_list if image not in assigned_images]
        return filtered_csv_data, no_image_rows, no_data_images

    def _create_collection(self, collection_data, source_dir, options):
        """Create the Collection described by the json data and install its image.

        The collection image is converted to jpeg into MEDIA_ROOT unless a file
        is already there; a failure to convert it is printed and ignored.
        """
        print('## Loading collection json')
        collection = Collection.objects.create(
            name=collection_data.get('name'),
            verbose_name=collection_data.get('verbose_name', ''),
            description=collection_data.get('description', ''),
            image=collection_data.get('image', ''),
            height=collection_data.get('height', 0),
            width=collection_data.get('width', 0),
        )
        if collection.image:
            collection_image_path = os.path.join(settings.MEDIA_ROOT, str(collection.image))
            if not os.path.isfile(collection_image_path):
                print('### Moving collection image')
                collection_image_name = os.path.basename(collection_image_path)
                try:
                    with ImagePIL.open(os.path.join(source_dir, collection_image_name)) as col_im:
                        print('##### Generating or copying jpeg for '+collection_image_name)
                        col_im.thumbnail(col_im.size)
                        col_im.save(collection_image_path, 'JPEG', quality=options.get('jpeg_quality', settings.IMG_JPG_DEFAULT_QUALITY))
                except Exception as e:
                    # Best effort: the import goes on without the collection image.
                    print(e)
        return collection

    @staticmethod
    def _build_metadata(item):
        """Build the metadata dict stored (as json) on ItemMetadata for a csv row."""
        # Designation and datation use the first non-empty field of their list.
        designation = next((item[key] for key in ('TITR', 'DENO', 'APPL') if item.get(key, '')), '')
        datation = next((item[key] for key in ('PERI', 'MILL', 'EPOQ') if item.get(key, '')), '')
        return {
            "authors" : item['AUTR'],
            "school" : item['ECOLE'],
            "designation" : designation,
            "field" : item['DOM'],
            "datation" : datation,
            "technics" : item['TECH'],
            "measurements" : item['DIMS'],
            "create_or_usage_location" : item['LIEUX'],
            "discovery_context" : item['DECV'],
            "conservation_location" : item['LOCA'],
            "photo_credits" : item['PHOT'],
            "inventory_number" : item['INV'],
            "joconde_ref" : item['REF']
        }

    def _import_item(self, item, collection, source_dir, target_dir, folder_pattern, options):
        """Create the database objects and image files for one csv row.

        Rows without a natural key ('REF') are skipped. When an item with the
        same natural key already exists in the collection nothing is created;
        the existing item is only added to its folder when folder import is
        enabled (folder_pattern is not None).
        """
        print('#### Computing metadatas for item '+item['REF']+' (natural key)')
        if not item['REF']:
            print('#### No Natural key, skipping')
            return
        natural_key = item['REF']
        new_metadata = self._build_metadata(item)

        if ItemMetadata.objects.filter(item__collection=collection, natural_key=natural_key).exists():
            print('#### An item with '+item['REF']+' for natural key, already exists in database in the import collection')
            if folder_pattern is not None:
                self._add_existing_item_to_folder(item, collection, folder_pattern, options['folders_metadata'])
            return

        print('#### Creating item '+item['REF']+' (natural key) in database')
        item_object = Item.objects.create(
            collection=collection
        )
        ItemMetadata.objects.create(
            item=item_object,
            metadata=json.dumps(new_metadata),
            natural_key=natural_key
        )
        self._import_item_images(item, item_object, source_dir, target_dir, options)

        print('### Generating thumbnails for item '+item['REF'])
        for image in item_object.images.all():
            for size in settings.PREGENERATE_THUMBNAILS_SIZES:
                print('#### Thumbnail for size '+str(size))
                get_thumbnail(image.media, size, crop=False)

    def _add_existing_item_to_folder(self, item, collection, folder_pattern, folders_metadata):
        """Add the already imported item of this csv row to its folder, creating the folder if needed."""
        # Extract folder name from the configured metadata (natural key by default)
        m = folder_pattern.search(item[folders_metadata])
        if m is None or m.group(1) is None:
            print('#### Could not extract a folder name from "'+item[folders_metadata]+'", skipping folder assignment')
            return
        folder_id = m.group(1)

        if not Folder.objects.filter(original_id=folder_id).exists():
            print('#### Creating folder "'+folder_id+'"')
            folder = Folder.objects.create(
                collection=collection,
                name='Dossier '+folder_id,
                original_id=folder_id
            )
        else:
            print('#### Folder "'+folder_id+'" already exists')
            folder = Folder.objects.get(original_id=folder_id)

        item_metadata = ItemMetadata.objects.get(item__collection=collection, natural_key=item['REF'])
        item_metadata.item.folders.add(folder)

    def _import_item_images(self, item, item_object, source_dir, target_dir, options):
        """Copy or convert the images of a csv row and create Image / ImageStats objects.

        An image that cannot be opened or converted is reported and skipped;
        the other images of the row are still imported.
        """
        print('#### Computing item image(s)')
        jpeg_quality = options.get('jpeg_quality', settings.IMG_JPG_DEFAULT_QUALITY)
        for image in item['SRC_IMG_FILES']:
            (image_name, ext) = os.path.splitext(image)
            if options.get('no-jpg-conversion') or ext in settings.NO_IMG_CONVERSION_EXTS:
                print('##### Copying file '+str(image)+' without converting')
                new_image_name = image
                shutil.copy(os.path.join(source_dir, image), target_dir)
                try:
                    # Opened only to read the dimensions of the copied file.
                    with ImagePIL.open(os.path.join(target_dir, image)) as im:
                        im_width, im_height = im.size
                except Exception as e:
                    print(e)
                    logger.warning('### Could not read image %r: %s', image, e)
                    continue
            else:
                new_image_name = image_name+'.jpg'
                image_path = os.path.join(target_dir, new_image_name)
                try:
                    if os.path.isfile(image_path):
                        print('##### A jpeg file already exists in target dir for '+ image)
                        with ImagePIL.open(image_path) as im:
                            im_width, im_height = im.size
                    else:
                        with ImagePIL.open(os.path.join(source_dir, image)) as im:
                            print('##### Generating or copying jpeg for '+image)
                            im.thumbnail(im.size)
                            im.save(image_path, 'JPEG', quality=jpeg_quality)
                            im_width, im_height = im.size
                except Exception as e:
                    print(e)
                    logger.warning('### Could not convert image %r: %s', image, e)
                    continue
            new_image = Image.objects.create(
                item=item_object,
                media='uploads/'+new_image_name,
                name=new_image_name,
                height=im_height,
                width=im_width
            )
            ImageStats.objects.create(
                image=new_image
            )

    def _print_recap(self, collection, no_data_images, no_image_rows, duplicate_rows, identifier_key):
        """Print and log the images without row, the rows without image and the duplicate rows."""
        logger.debug('# Recap for import command: ')
        print('# Images without data: ')
        logger.debug('## Checking images left without data')
        # The collection image lives in the source directory too but is not
        # expected to match any csv row.
        collection_image_file = os.path.split(str(collection.image))[1]
        if no_data_images and collection_image_file in no_data_images:
            no_data_images.remove(collection_image_file)

        if no_data_images:
            for image in no_data_images:
                logger.debug('### %r', image)
                print('## '+image)
        else:
            print('## Each image has one corresponding row!')
            logger.debug('### Each image has one corresponding row!')

        print('# CSV Items without image')
        logger.debug('## Checking csv rows left without image')
        if no_image_rows:
            for row in no_image_rows:
                logger.debug('### %r', row['REF'])
                print('## Natural key: '+row['REF'])
        else:
            print('## Each row found at least one corresponding image!')
            logger.debug('### Each row found at least one corresponding image!')

        print('# Duplicate rows in csv')
        logger.debug('## Checking duplicate rows in csv')
        if duplicate_rows:
            for row in duplicate_rows:
                logger.debug('### %r: %r', identifier_key, row[identifier_key])
                print('## '+identifier_key+': '+row[identifier_key])
        else:
            print('## No duplicate rows were found in the csv!')
            logger.debug('### No duplicate rows were found in the csv!')