src/egonomy/management/commands/import_rmn.py
author ymh <ymh.work@gmail.com>
Sat, 23 Feb 2013 01:58:26 +0100
changeset 64 9294f5c1a897
parent 36 src/egonomy/management/commands/importRmn.py@daa526d27044
permissions -rw-r--r--
rename import commands, check linux virtualenv creation

# -*- coding: utf-8 -*-
'''
Created on Jan 31, 2013

@author: ymh
'''

from ..utils import show_progress
from django.core.management.base import BaseCommand, CommandError
from django.conf import settings
from django.db import models, transaction
from egonomy.models import Image, ImageInfo, ImageMetadata
from optparse import make_option
import mimetypes
import csv
import decimal
import os.path
import sys
import shutil
import PIL.Image
import PIL.ExifTags
import json
import datetime


class Command(BaseCommand):
    '''
    Import rmn csv files.

    Each csv row produces an ImageMetadata object and an Image object keyed
    on the value of the CLICHE column.  When a file whose base name (without
    extension) equals that value is found under the --image-path folder, the
    file is copied under MEDIA_ROOT/<upload_to> and an ImageInfo object is
    created as well.  Objects are inserted with bulk_create and committed
    every --batch-size lines.
    '''

    args = 'csv_file csv_file ...'
    help = 'Import rmn csv files'

    option_list = BaseCommand.option_list + (
        make_option('--check-id',
            action= 'store_true',
            dest= 'check_id',
            default= False,
            help= 'check an image id before trying to insert it, may be a lot slower'
        ),
        make_option('-p', '--image-path',
            dest= 'image_path',
            default= None,
            help= 'path to the root o image folder'
        ),
        make_option('-n', '--max-lines',
            dest= 'max_lines',
            type='int',
            default= sys.maxint,
            help= 'max number of line to process, -1 process all file'
        ),
        make_option('-b', '--batch-size',
            dest= 'batch_size',
            type='int',
            default= 5000,
            help= 'number of object to import in bulk operations'
        ),
        make_option('-e', '--encoding',
            dest= 'encoding',
            default= 'latin1',
            help= 'csv files encoding'
        ),
        make_option('--skip',
            dest= 'skip',
            type='int',
            default= 0,
            help= 'number of entry to skip'
        ),
        # --stop stores False in 'cont': the default is to continue on error.
        make_option('--stop',
            dest= 'cont',
            action= 'store_false',
            default= True,
            help= 'stop on error'
        ),
        make_option('-l', '--log',
            dest= 'log',
            default= 'log.txt',
            help= 'log file'
        ),
    )

    def __safe_get(self, dict_arg, key, conv = lambda x: x, default= None):
        '''
        Return conv(dict_arg[key]), or default when the key is missing or its
        value is falsy (None, empty string, ...).

        The default is returned as is: it is never passed through conv.
        Exceptions raised by conv on a non-empty value are propagated.
        '''
        val = dict_arg.get(key)
        if not val:
            return default
        return conv(val)

    def __safe_decode(self, s):
        '''
        Return a unicode version of a byte string, anything else unchanged.

        The byte string is decoded as utf8 and, if that fails, as latin1
        (which accepts any byte sequence).
        '''
        # Unicode objects must not be decoded again: in Python 2 that triggers
        # an implicit ascii encode which fails on non-ascii characters.
        if not isinstance(s, basestring) or isinstance(s, unicode):
            return s
        try:
            return s.decode('utf8')
        except UnicodeDecodeError:
            return s.decode('latin1')

    def __sniff_dialect(self, csv_file):
        '''
        Guess the csv dialect from the first 1024 bytes of an open file, force
        doublequote on it and rewind the file to its beginning.
        '''
        dialect = csv.Sniffer().sniff(csv_file.read(1024))
        dialect.doublequote = True
        csv_file.seek(0)
        return dialect

    def __index_image_files(self, root_img_dir):
        '''
        Walk root_img_dir and return a dict mapping each file base name
        (without extension) to a (full path, path relative to root_img_dir)
        tuple.  When several files share a base name, the last one met wins.
        '''
        files_map = {}
        for dirpath, _dirnames, filenames in os.walk(root_img_dir, topdown = True):
            for f in filenames:
                full_path = os.path.join(dirpath, f)
                rel_path = os.path.relpath(full_path, root_img_dir)
                files_map[os.path.splitext(f)[0]] = (full_path, rel_path)
        return files_map

    def __count_rows(self, csv_paths, max_lines):
        '''
        Count the rows of the csv files, stopping as soon as max_lines is
        exceeded.

        Returns a (number of lines to process, dialects) tuple where dialects
        maps each file path that was opened to its sniffed dialect.
        '''
        dialects = {}
        total = 0
        for csv_file_path in csv_paths:
            with open(csv_file_path,'rb') as csv_file:
                dialect = self.__sniff_dialect(csv_file)
                dialects[csv_file_path] = dialect
                for _ in csv.DictReader(csv_file, dialect=dialect):
                    total += 1
                    if total > max_lines:
                        break
            if total > max_lines:
                # limit reached: no need to open the remaining files
                break
        return min(max_lines, total), dialects

    def __build_metadata(self, urow, img_id):
        '''
        Build (without saving) the ImageMetadata object for a decoded csv row.
        '''
        return ImageMetadata(
            id = img_id,
            cliche = img_id,
            inventaire = self.__safe_get(urow, 'INVENTAIRE'),
            titre = self.__safe_get(urow, 'TITRE'),
            description = self.__safe_get(urow, 'DESCRIPTION'),
            date = self.__safe_get(urow, 'DATE', int, None),
            longueur = self.__safe_get(urow, 'LONGUEUR', decimal.Decimal, None),
            hauteur = self.__safe_get(urow, 'HAUTEUR', decimal.Decimal, None),
            profondeur = self.__safe_get(urow, 'PROFONDEUR', decimal.Decimal, None),
            diametre = self.__safe_get(urow, 'DIAMETRE', decimal.Decimal, None),
            # NOTE(review): 'PHOTOGRAPE' looks like a typo for 'PHOTOGRAPHE';
            # confirm against the csv header before changing it.
            photographe = self.__safe_get(urow, 'PHOTOGRAPE'),
            auteur = self.__safe_get(urow, 'AUTEUR'),
            droits = self.__safe_get(urow, 'DROITS'),
            mentions = self.__safe_get(urow, 'MENTIONS'),
            periode  = self.__safe_get(urow, 'PERIODE'),
            technique = self.__safe_get(urow, 'TECHNIQUE'),
            site = self.__safe_get(urow, 'SITE'),
            lieu = self.__safe_get(urow, 'LIEU'),
            localisation = self.__safe_get(urow, 'LOCALISATION'),
            mots_cles = self.__safe_get(urow, 'MOTS_CLES')
        )

    def __build_image_info(self, img_id, finfo, image_root, upload_to):
        '''
        Copy the image file described by finfo (full path, relative path)
        under image_root and build (without saving) its ImageInfo object.
        '''
        img_fullpath, img_relpath = finfo
        dest_path = os.path.join(image_root, img_relpath)
        dest_dir = os.path.dirname(dest_path)
        if not os.path.exists(dest_dir):
            os.makedirs(dest_dir)
        shutil.copy(img_fullpath, dest_path)

        mimestr = mimetypes.guess_type(dest_path, False)[0]
        img = PIL.Image.open(dest_path)
        width, height = img.size

        # _getexif is not defined by every PIL image class: an image without
        # it is imported with no exif data instead of being rejected.
        getexif = getattr(img, '_getexif', None)
        raw_exif = getexif() if getexif is not None else None
        exif = None
        if raw_exif:
            # numeric exif tags are replaced by their name when it is known
            exif = dict((PIL.ExifTags.TAGS.get(k,k), self.__safe_decode(v)) for (k,v) in raw_exif.items())

        img_info_obj = ImageInfo(
            id = img_id,
            width = width,
            height = height,
            mimetype = mimestr,
            exif = json.dumps(exif) if exif else None
        )
        img_info_obj.image_file.name = os.path.join(upload_to, img_relpath)
        return img_info_obj

    def __flush(self, img_objs_md, img_objs_info, img_objs):
        '''
        Bulk insert the pending objects, empty the three lists in place and
        commit the transaction.
        '''
        ImageMetadata.objects.bulk_create(img_objs_md)
        ImageInfo.objects.bulk_create(img_objs_info)
        Image.objects.bulk_create(img_objs)
        del img_objs_md[:]
        del img_objs_info[:]
        del img_objs[:]
        transaction.commit()

    def handle(self, *args, **options):
        '''
        Import the csv files given as positional arguments.

        Raises CommandError when MEDIA_ROOT is not set, when no image path is
        given or when the batch size is not strictly positive.  Errors on a
        single line are appended to the log file; they abort the import only
        when --stop is given.
        '''

        #getting path to copy images
        imageInfoModel = models.get_model('egonomy', 'ImageInfo')
        upload_to = imageInfoModel._meta.get_field_by_name('image_file')[0].upload_to
        media_root = getattr(settings, 'MEDIA_ROOT', None)

        if not media_root:
            raise CommandError('The setting MEDIA_ROOT must be set')

        image_root = os.path.abspath(os.path.join(media_root, upload_to))

        batch_size = options.get('batch_size', 5000)
        if not batch_size or batch_size < 1:
            # a zero batch size would make every "counter % batch_size" fail
            raise CommandError("The batch size must be a strictly positive integer")

        print("Caching filenames...")

        root_img_dir = options.get('image_path', None)

        if not root_img_dir:
            raise CommandError("No image path. the -p or --image-path options is compulsory")

        root_img_dir = os.path.abspath(root_img_dir)
        image_filemanes_map = self.__index_image_files(root_img_dir)

        print("caching done. %d file found " % len(image_filemanes_map))

        max_lines = options.get('max_lines', sys.maxint)
        if max_lines is None or max_lines < 0:
            # documented behaviour: -1 means "process all file"
            max_lines = sys.maxint
        skip = options.get('skip', 0)

        # calculating the number of lines to process
        print("calculating number of line to process")
        nb_lines, csv_files_dialect = self.__count_rows(args, max_lines)

        print("There is %d lines to process, starting processing now." % nb_lines)
        counter = 0
        writer = None
        img_objs = []
        img_objs_md = []
        img_objs_info = []
        # ids waiting in the current batch, only maintained with --check-id
        pending_ids = set()
        check_id = options.get('check_id', False)
        encoding = options.get('encoding', 'latin1')
        log_path = options.get('log', "log.txt")
        cont_on_error = options.get('cont', True)

        transaction.enter_transaction_management()
        transaction.managed()
        try:
            for csv_file_path in args:
                with open(csv_file_path,'rb') as csv_file:
                    dialect = csv_files_dialect.get(csv_file_path,None)
                    if not dialect:
                        # file not reached by the counting pass
                        dialect = self.__sniff_dialect(csv_file)

                    dictreader = csv.DictReader(csv_file, dialect=dialect)
                    for row in dictreader:
                        # counter spans all files and includes skipped lines
                        counter += 1
                        if counter <= skip:
                            continue
                        if counter > nb_lines:
                            break

                        try:
                            # Only byte strings are decoded: missing fields are
                            # None and surplus fields come as a list.
                            urow = dict((k, v.decode(encoding, 'replace') if (v and isinstance(v, str)) else v) for k,v in row.items())
                            writer = show_progress(counter, nb_lines, u"%s - %s - %d/%d" % (urow['CLICHE'], urow['TITRE'], counter%batch_size, batch_size), 80, writer)

                            img_id = urow['CLICHE']

                            # the pending batch is checked too, as its objects
                            # are not yet in the database
                            if check_id and (img_id in pending_ids or ImageMetadata.objects.filter(cliche=img_id).exists()):
                                raise CommandError("Duplicate entry line %d of file %s" % (dictreader.line_num, csv_file_path))

                            img_md_obj = self.__build_metadata(urow, img_id)

                            img_info_obj = None
                            finfo = image_filemanes_map.get(img_id, None)
                            if finfo is not None:
                                img_info_obj = self.__build_image_info(img_id, finfo, image_root, upload_to)

                            img_obj = Image(
                                id = img_id,
                                metadata = img_md_obj,
                                info = img_info_obj
                            )

                            # nothing is queued before the whole line succeeded
                            img_objs_md.append(img_md_obj)
                            if img_info_obj is not None:
                                img_objs_info.append(img_info_obj)
                            img_objs.append(img_obj)
                            if check_id:
                                pending_ids.add(img_id)

                        except Exception as e:
                            # NOTE(review): the import keys on the CLICHE column;
                            # confirm that the csv files really have an ID column.
                            error_msg = "%s - Error treating line %d, file %s local %d : id %s - title : %s : %s\n" % (datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),counter, csv_file_path, dictreader.line_num, row.get('ID') or 'n/a', row.get('TITRE') or 'n/a', repr(e) )
                            with open(log_path, 'a') as log_file:
                                log_file.write(error_msg)
                            if not cont_on_error:
                                raise

                        if not (counter%batch_size):
                            self.__flush(img_objs_md, img_objs_info, img_objs)
                            pending_ids.clear()

                if counter > nb_lines:
                    break

            if img_objs:
                # last, incomplete batch
                self.__flush(img_objs_md, img_objs_info, img_objs)
                pending_ids.clear()

            no_img_req = Image.objects.filter(info=None)

            if no_img_req.count() > 0:
                print("WARNING : the following images have no image files :")
                for img_obj in no_img_req:
                    print("%s : %s" % (img_obj.metadata.id, img_obj.metadata.titre))
            transaction.commit()
        except:
            # deliberately bare: roll back on any interruption, then re-raise
            transaction.rollback()
            raise
        finally:
            transaction.leave_transaction_management()