src/iconolab_mcc/management/commands/importimages.py
author ymh <ymh.work@gmail.com>
Sat, 23 Jun 2018 02:04:20 +0200
changeset 14 bff393b23a68
parent 11 93228a694ce7
child 21 631f70f55fed
permissions -rw-r--r--
change natural key definition

# -*- coding: UTF-8 -*-
import csv
import json
import logging
import os
import pprint
import re
import shutil

from django.conf import settings
from django.core.management.base import BaseCommand, CommandError
from PIL import Image as ImagePIL
from sorl.thumbnail import get_thumbnail

from iconolab.management.commands.importimages import BaseImportImagesCommand
from iconolab.models import (Collection, Folder, Image, ImageStats, Item,
                             ItemMetadata, MetaCategory)

if settings.IMPORT_LOGGER_NAME and settings.LOGGING['loggers'].get(settings.IMPORT_LOGGER_NAME, ''):
    logger = logging.getLogger(settings.IMPORT_LOGGER_NAME)
else:
    logger = logging.getLogger(__name__)


class Command(BaseImportImagesCommand):
    help = 'imports images from a directory into the media folder and creates the related Item and Image objects'

    def add_arguments(self, parser):
        parser.add_argument('csv_path')
        parser.add_argument(
            '--jpeg-quality',
            dest='jpeg_quality',
            type=int,
            default=settings.IMG_JPG_DEFAULT_QUALITY,
            help='JPEG quality used when converting images'
        )
        parser.add_argument(
            '--encoding',
            dest='encoding',
            default='utf-8',
            help='CSV file encoding'
        )
        parser.add_argument(
            '--collection-json',
            dest='collection_json',
            default=False,
            help='creates a new collection from a json file, must be an object with fields: ' +
                 '"name" (identifier), ' +
                 '"verbose_name" (proper title name), ' +
                 '"description" (description on the homepage, html is supported), ' +
                 '"image" (image on the homepage, must be "uploads/<imgname>"), ' +
                 '"height" and "width" (height and width of the image)',
        )
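        # Illustrative example of a collection json file (values below are hypothetical,
        # only the field names come from the help text above):
        # {
        #     "name": "my-collection",
        #     "verbose_name": "My Collection",
        #     "description": "<p>Short description shown on the homepage</p>",
        #     "image": "uploads/my-collection.jpg",
        #     "height": 600,
        #     "width": 800
        # }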
        parser.add_argument(
            '--collection-id',
            dest='collection_id',
            default=False,
            help='insert extracted data into the specified collection instead of trying to load a collection fixture',
        )
        parser.add_argument(
            '--metacategories-json',
            dest='metacategories_json',
            default=False,
            help='add metacategories to the collection from a json file (the json must be a list of objects with "label" and "triggers_notifications" fields)',
        )
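        # Illustrative example of a metacategories json file (labels are hypothetical):
        # [
        #     {"label": "Iconography", "triggers_notifications": true},
        #     {"label": "Materials", "triggers_notifications": false}
        # ]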
        parser.add_argument(
            '--delimiter',
            dest='csv_delimiter',
            default=';',
            help='csv file delimiter'
        )
        parser.add_argument(
            '--no-jpg-conversion',
            dest='no-jpg-conversion',
            default=False,
            help='use this option if you only want the image copied and not converted'
        )
        parser.add_argument(
            '--img-filename-identifier',
            dest='img_filename_identifier',
            default=settings.IMPORT_DEFAULT_FIELD_TO_FILENAME_IDENTIFIER,
            help='codename of the csv field whose value is matched against image filenames to find the images related to a given row'
        )
        parser.add_argument(
            '--filename-regexp-prefix',
            dest='filename_regexp_prefix',
            default=r'.*',
            help='regexp prefix to properly parse image names with info from csv. The pattern should describe the part before the filename identifier string, default is .*'
        )
        parser.add_argument(
            '--filename-regexp-suffix',
            dest='filename_regexp_suffix',
            default=r'[\.\-_].*',
            help=r'regexp suffix to properly parse image names with info from csv. The pattern should describe the part after the filename identifier string, default is [\.\-_].*'
        )
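        # Example: with the default prefix '.*' and suffix '[\.\-_].*', an identifier value
        # of '12345' (hypothetical) yields the pattern '.*12345[\.\-_].*', which matches
        # filenames such as 'museum_12345_01.jpg' or '12345-front.tif'.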
        parser.add_argument(
            '--folders',
            dest='import_folders',
            default=False,
            action='store_const',
            const=True,
            help='option to create folders'
        )
        parser.add_argument(
            '--folders-regexp',
            dest='folders_regexp',
            default=False,
            help='regexp used to extract the folder name/number'
        )
        parser.add_argument(
            '--folders-metadata',
            dest='folders_metadata',
            default='REF',
            help='metadata from which to extract the folder name/number'
        )

    def handle(self, *args, **options):
        """
            Step-by-step for import:

            1) Argument checks for file existence and database state to check that everything can proceed without issue before reading the files
            1) We import data from csv in a 'pivot' list of dicts 'cleaned_row_data' with the following logic:
                * in the settings, there is value "IMPORT_FIELDS_DICT" that is a dict where each key is an identifier for the metadatas
                to which we associate a list of column header that will identified as that metadata
                * The cleaned_row_data list will associate the identifier with the actual value for its related column
            2) Once we have cleaned_row_data, we filter out rows that don't have any associated image into a 'filtered_row_data' list, and add a key "SRC_IMG_FILES" that contains the list of images associated
            to each row for the filtered data.
            3) At this point we have a list of all the items that will be created into the database and the related images to import, so we create the collection object if necessary
            4) For each item:
                We create the object in the database
                * Metadatas are extracted from the filtered_csv_data using the pivot identifiers from settings.IMPORT_FIELD_DICT
                We copy/convert the image into the MEDIA_ROOT/uploads/ dir: thumbnails size listed in settings.PREGENERATE_THUMBNAIL_SIZES are pre-generated for each image

            Note: each unused row and each unused image in the import folder is kept track of in no_data_images, no_image_rows and duplicate_rows lists and logged at the end of the command.
        """
        try:
            print('# Logging with logger '+logger.name)
            logger.debug('# Initializing command with args: %r', options)
            # Check we have a collection to store data into:
            self.source_dir = os.path.dirname(
                os.path.realpath(options.get('csv_path')))
            print('# Checking collection args')
            if options.get('collection_json'):
                print('## Finding collection json data in '+self.source_dir)
                collection_json_path = os.path.join(
                    self.source_dir, options.get('collection_json'))
                if not os.path.isfile(collection_json_path):
                    print('### No '+options.get('collection_json') +
                          ' file was found in the source directory')
                    raise ValueError('!!! Json file ' +
                                     collection_json_path+' was not found !!!')
                try:
                    with open(collection_json_path) as json_fixture_file:
                        collection_data = json.loads(json_fixture_file.read())
                        for key in ['name', 'verbose_name', 'description', 'image', 'height', 'width']:
                            if key not in collection_data:
                                print('!!! Json file '+collection_json_path +
                                      ' has no '+key+' field !!!')
                                raise ValueError()
                        if not collection_data.get('name', ''):
                            print('!!! Collection data key "name" is empty')
                            raise ValueError()
                        if Collection.objects.filter(name=collection_data.get('name')).exists():
                            print(
                                '!!! A Collection with the provided name already exists!')
                            raise ValueError()
                        if collection_data.get('image', '') and not (collection_data.get('width', 0) and collection_data.get('height', 0)):
                            print(
                                '!!! Collection data has an image but no height and width')
                            raise ValueError()
                except ValueError as e:
                    raise ValueError('!!! JSON data in '+collection_json_path+' is invalid. !!!') from e
            elif options.get('collection_id'):
                print('## Finding collection with id ' +
                      options.get('collection_id'))
                try:
                    collection = Collection.objects.get(
                        pk=options.get('collection_id'))
                except Collection.DoesNotExist:
                    raise ValueError('!!! Collection with primary key ' +
                                     options.get('collection_id')+' was not found, aborting !!!')
            else:
                raise ValueError(
                    '!!! No collection fixture or collection id, aborting because we can\'t properly generate data. !!!')

            if options.get('metacategories_json'):
                print('## Finding metacategories fixture json data in '+self.source_dir)
                metacategories_json_path = os.path.join(
                    self.source_dir, options.get('metacategories_json'))
                if not os.path.isfile(metacategories_json_path):
                    print('### No '+options.get('metacategories_json') +
                          ' file was found in the source directory')
                    raise ValueError(
                        '!!! Fixture file '+metacategories_json_path+' was not found !!!')
                with open(metacategories_json_path) as metacategories_json_file:
                    metacategories_data = json.loads(
                        metacategories_json_file.read())
                    for metacategory in metacategories_data:
                        if metacategory.get('label', None) is None:
                            raise ValueError(
                                '!!! Metacategory without label !!!')

            if options['import_folders'] and not options['folders_regexp']:
                raise ValueError(
                    '!!! No regexp specified to extract folder name !!!')

            # We read the csv
            delimiter = options.get('csv_delimiter')
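            # Non-printable delimiters can be passed as '#<ascii code>': tab (#9) and the
            # ASCII group/record/unit separators (#29, #30, #31)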
            if delimiter == '#9':
                delimiter = chr(9)
            elif delimiter == '#29':
                delimiter = chr(29)
            elif delimiter == '#30':
                delimiter = chr(30)
            elif delimiter == '#31':
                delimiter = chr(31)
            with open(options.get('csv_path'), encoding=options.get('encoding')) as csv_file:
                csv_rows = list(csv.DictReader(csv_file, delimiter=delimiter))
            print('# Extracting data from csv file and storing it in standardized format')
            # We store data using the Jocondelab keys, as defined in settings.IMPORT_FIELDS_DICT
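            # A hypothetical shape for settings.IMPORT_FIELDS_DICT (the real mapping is
            # project-specific, the column header lists below are made up):
            # IMPORT_FIELDS_DICT = {
            #     'REF': ['REF', 'Reference'],
            #     'TITR': ['TITR', 'Titre'],
            #     'AUTR': ['AUTR', 'Auteur'],
            # }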
            cleaned_csv_data = []
            duplicate_rows = []
            for row in csv_rows:
                cleaned_row_data = {}
                for key in settings.IMPORT_FIELDS_DICT.keys():
                    cleaned_row_data[key] = ''
                    for row_key in row.keys():
                        if row_key in settings.IMPORT_FIELDS_DICT[key]:
                            if key == 'REF':
                                ref_number, _, _ = row[row_key].partition(';')
                                cleaned_row_data[key] = ref_number.rstrip()
                            else:
                                cleaned_row_data[key] = row[row_key]
                            break
                identifier_key = options.get('img_filename_identifier')
                if cleaned_row_data[identifier_key] in [existing_row[identifier_key] for existing_row in cleaned_csv_data]:
                    print("## We already have "+identifier_key+" value " +
                          cleaned_row_data[identifier_key]+" in the data to import, ignoring duplicate line")
                    duplicate_rows.append(cleaned_row_data)
                else:
                    cleaned_csv_data.append(cleaned_row_data)
            # Listing image files in csv directory
            image_list = [
                f for f in os.listdir(self.source_dir)
                if os.path.isfile(os.path.join(self.source_dir, f))
                and f.lower().endswith(('.jpg', '.tif', '.bmp', '.png'))
            ]  # Maybe check if the file really is an image another way (e.g. by opening it with PIL)
            filtered_csv_data = []
            no_image_rows = []
            no_data_images = []
            assigned_images = []
            # Now we trim the cleaned_csv_data dict to keep only entries that have at least one image
            for item in cleaned_csv_data:
                item['SRC_IMG_FILES'] = []
                has_image = False
                img_name_pattern = options.get('filename_regexp_prefix')+re.escape(
                    item[options.get('img_filename_identifier')])+options.get('filename_regexp_suffix')
                for image in image_list:
                    if re.match(img_name_pattern, image):
                        item['SRC_IMG_FILES'].append(image)
                        assigned_images.append(image)
                        has_image = True
                if has_image:
                    filtered_csv_data.append(item)
                else:
                    # We keep track of the entries that don't have any corresponding image
                    no_image_rows.append(item)
            # We keep track of the images that don't have any corresponding entry
            for image in image_list:
                if image not in assigned_images:
                    no_data_images.append(image)

            print('## found ' + str(len(filtered_csv_data)) +
                  ' items with at least one image')
            print('# Importing data into Iconolab')
            if options.get('collection_json'):
                print('## Loading collection json')
                collection = Collection.objects.create(
                    name=collection_data.get('name'),
                    verbose_name=collection_data.get('verbose_name', ''),
                    description=collection_data.get('description', ''),
                    image=collection_data.get('image', ''),
                    height=collection_data.get('height', 0),
                    width=collection_data.get('width', 0),
                )
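                # If the collection fixture references an image that is not already present
                # under MEDIA_ROOT, convert/copy it from the source directory as a JPEG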
                if collection.image:
                    collection_image_path = os.path.join(
                        settings.MEDIA_ROOT, str(collection.image))
                    if not os.path.isfile(collection_image_path):
                        print('### Moving collection image')
                        _, collection_image_name = os.path.split(
                            collection_image_path)
                        try:
                            col_im = ImagePIL.open(os.path.join(
                                self.source_dir, collection_image_name))
                            print('##### Generating or copying jpeg for ' +
                                  collection_image_name)
                            col_im.thumbnail(col_im.size)
                            col_im.save(collection_image_path, 'JPEG', quality=options.get(
                                'jpeg_quality', settings.IMG_JPG_DEFAULT_QUALITY))
                        except Exception as e:
                            print(e)
            if options.get('metacategories_json'):
                for metacategory in metacategories_data:
                    MetaCategory.objects.create(
                        collection=collection,
                        label=metacategory.get('label'),
                        triggers_notifications=metacategory.get(
                            'triggers_notifications', 0)
                    )
            print(
                '## Converting images and moving them to the media dir, creating Image and Item objects')
            target_dir = os.path.join(settings.MEDIA_ROOT, Image.media.field.upload_to)
            print('### Images will be stored in ' + target_dir)
            for item in filtered_csv_data:
                print('#### Computing metadata for item ' +
                      item['REF']+' (natural key)')
                if not item['REF']:
                    print('#### No natural key, skipping')
                    continue
                item_authors = item['AUTR']
                item_school = item['ECOLE']
                item_designation = ''
                if item.get('TITR', ''):
                    item_designation = item['TITR']
                elif item.get('DENO', ''):
                    item_designation = item['DENO']
                elif item.get('APPL', ''):
                    item_designation = item['APPL']
                item_datation = ''
                if item.get('PERI', ''):
                    item_datation = item['PERI']
                elif item.get('MILL', ''):
                    item_datation = item['MILL']
                elif item.get('EPOQ', ''):
                    item_datation = item['EPOQ']
                item_technics = item['TECH']
                item_field = item['DOM']
                item_measurements = item['DIMS']
                item_create_or_usage_location = item['LIEUX']
                item_discovery_context = item['DECV']
                item_conservation_location = item['LOCA']
                item_photo_credits = item['PHOT']
                item_inventory_number = item['INV']
                item_joconde_ref = item['REF']
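                # The natural key combines the collection name and the Joconde REF as
                # "<collection name>|<REF>"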
                natural_key = "%s|%s" % (collection.name, item_joconde_ref)
                if ItemMetadata.objects.filter(item__collection=collection, natural_key=natural_key).exists():
                    print('#### An item with natural key ' +
                          natural_key+' already exists in the import collection')

                    if options['import_folders']:

                        # Extract the folder name/number from the configured metadata field
                        m = re.search(
                            options['folders_regexp'], item[options['folders_metadata']])
                        if m is None:
                            print('#### Folder regexp did not match "' +
                                  item[options['folders_metadata']]+'", skipping folder assignment')
                            continue
                        folder_id = m.group(1)

                        if not Folder.objects.filter(original_id=folder_id).exists():
                            print('#### Creating folder "'+folder_id+'"')
                            folder = Folder.objects.create(
                                collection=collection,
                                name='Dossier '+folder_id,
                                original_id=folder_id
                            )
                        else:
                            print('#### Folder "'+folder_id+'" already exists')
                            folder = Folder.objects.get(original_id=folder_id)

                        item_metadata = ItemMetadata.objects.get(
                            item__collection=collection, natural_key=natural_key)
                        item = item_metadata.item

                        item.folders.add(folder)

                else:
                    new_metadata = {
                        "authors": item_authors,
                        "school": item_school,
                        "designation": item_designation,
                        "field": item_field,
                        "datation": item_datation,
                        "technics": item_technics,
                        "measurements": item_measurements,
                        "create_or_usage_location": item_create_or_usage_location,
                        "discovery_context": item_discovery_context,
                        "conservation_location": item_conservation_location,
                        "photo_credits": item_photo_credits,
                        "inventory_number": item_inventory_number,
                        "joconde_ref": item_joconde_ref
                    }
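                    # create_item_and_metadata is inherited from BaseImportImagesCommand;
                    # it creates the Item and its metadata and handles the image copy/conversion
                    # described in the docstring above (including thumbnail pre-generation)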
                    self.create_item_and_metadata(
                        natural_key, collection, new_metadata, item['SRC_IMG_FILES'], options, self.source_dir)

            print('# All done!')

            logger.debug('# Recap for import command: ')
            print('# Images without data: ')
            logger.debug('## Checking images left without data')
            collection_image_file = os.path.split(str(collection.image))[1]
            if no_data_images and collection_image_file in no_data_images:
                no_data_images.remove(collection_image_file)

            if no_data_images:
                for image in no_data_images:
                    logger.debug('### %r', image)
                    print('## '+image)
            else:
                print('## Each image has one corresponding row!')
                logger.debug('### Each image has one corresponding row!')
            print('# CSV Items without image')
            logger.debug('## Checking csv rows left without image')
            if no_image_rows:
                for item in no_image_rows:
                    logger.debug('### %r', item['REF'])
                    print('## Natural key: '+item['REF'])
            else:
                print('## Each row found at least one corresponding image!')
                logger.debug(
                    '### Each row found at least one corresponding image!')
            print('# Duplicate rows in csv')
            logger.debug('## Checking duplicate rows in csv')
            if duplicate_rows:
                for item in duplicate_rows:
                    logger.debug('### %r: %r', options.get(
                        'img_filename_identifier'), item[options.get('img_filename_identifier')])
                    print('## '+options.get('img_filename_identifier') +
                          ': '+item[options.get('img_filename_identifier')])
            else:
                print('## No duplicate rows were found in the csv!')
                logger.debug('### No duplicate rows were found in the csv!')
        except FileNotFoundError:
            print('!!! File '+options.get('csv_path')+' does not exist. !!!')
        except ValueError as e:
            print(str(e))