src/hdabo/management/commands/import_rdf.py
changeset 442:3d54acec55d6
child 443:27f71b0a772d
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/hdabo/management/commands/import_rdf.py	Wed Dec 10 10:05:38 2014 +0100
@@ -0,0 +1,455 @@
+# -*- coding: utf-8 -*-
+'''
+Created on May 25, 2011
+
+
+Compilation:
+
+# Install librdf
+# install raptor2 : configure --prefix=<path to venv> + make + make install
+# install librasqal : PKG_CONFIG_PATH=/Users/ymh/dev/venvs/hdalab/lib/pkgconfig RAPTOR2_LIBS="-L/Users/ymh/dev/venvs/hdalab/lib -lraptor2" RAPTOR2_CFLAGS="-I/Users/ymh/dev/venvs/hdalab/include/raptor2" RAPTOR_VERSION=2.0.15 ./configure --prefix=/Users/ymh/dev/venvs/hdalab
+
+
+raptor2-2.0.15
+./configure --prefix=/Users/ymh/dev/venvs/hdalab
+
+rasqal-0.9.32:
+
+diff --git a/rasqal-0.9.32/configure b/rasqal-0.9.32.new/configure
+index a29a606..b1dda08 100755
+--- a/rasqal-0.9.32/configure
++++ b/rasqal-0.9.32.new/configure
+@@ -14105,7 +14105,7 @@ else
+         { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+ $as_echo "yes" >&6; }
+
+-  RAPTOR_VERSION=`$PKG_CONFIG raptor2 --modversion 2>/dev/null`
++  RAPTOR_VERSION="2.0.15" #`$PKG_CONFIG raptor2 --modversion 2>/dev/null`
+   raptor_too_old=0
+   as_arg_v1=$RAPTOR_VERSION
+ as_arg_v2=$RAPTOR_MIN_VERSION
+
+RAPTOR2_LIBS="-L/Users/ymh/dev/venvs/hdalab/lib -lraptor2" RAPTOR2_CFLAGS="-I/Users/ymh/dev/venvs/hdalab/include/raptor2" RAPTOR_VERSION=2.0.15 ./configure --prefix=/Users/ymh/dev/venvs/hdalab
+
+sqlite:
+./configure --prefix=/Users/ymh/dev/venvs/hdalab
+
+unixODBC:
+./configure --prefix=/Users/ymh/dev/venvs/hdalab --enable-gui=no
+make
+make install
+
+redland-1.0.17:
+CFLAGS="-I/Users/ymh/dev/venvs/hdalab/include" PKG_CONFIG_PATH=/Users/ymh/dev/venvs/hdalab/lib/pkgconfig RAPTOR2_LIBS="-L/Users/ymh/dev/venvs/hdalab/lib -lraptor2" RAPTOR2_CFLAGS="-I/Users/ymh/dev/venvs/hdalab/include/raptor2" RAPTOR_VERSION=2.0.15 RASQAL_LIBS="-L/Users/ymh/dev/venvs/hdalab/lib -lrasqal"  RASQAL_CFLAGS="-I/Users/ymh/dev/venvs/hdalab/include/rasqal" ./configure --prefix=/Users/ymh/dev/venvs/hdalab
+
+redland-bindings
+CFLAGS="-L/Users/ymh/dev/venvs/hdalab/lib -I/Users/ymh/dev/venvs/hdalab/include -I/Users/ymh/dev/venvs/hdalab/include/raptor2 -I/Users/ymh/dev/venvs/hdalab/include/rasqal -lraptor2 -lrasqal -lrdf" PKG_CONFIG_PATH=/Users/ymh/dev/venvs/hdalab/lib/pkgconfig RAPTOR2_LIBS="-L/Users/ymh/dev/venvs/hdalab/lib -lraptor2" RAPTOR2_CFLAGS="-I/Users/ymh/dev/venvs/hdalab/include/raptor2" RAPTOR_VERSION=2.0.15 RASQAL_LIBS="-L/Users/ymh/dev/venvs/hdalab/lib -lrasqal"  RASQAL_CFLAGS="-I/Users/ymh/dev/venvs/hdalab/include/rasqal" ./configure --prefix=/Users/ymh/dev/venvs/hdalab --with-python=/Users/ymh/dev/venvs/hdalab/bin/python --with-python-ldflags="-L/Users/ymh/dev/venvs/hdalab/lib -L/System/Library/Frameworks/Python.framework/Versions/2.7/lib -lpython2.7 -v -Wl,-dylib" --with-python-libext=.so
+
+
+---
+prerequisite:
+- pkg-config:
+./configure --prefix=/Users/ymh/dev/venvs/redland --with-internal-glib && make && make install
+
+- sqlite:
+./configure --prefix=/Users/ymh/dev/venvs/redland && make && make install
+
+unixODBC:
+./configure --prefix=/Users/ymh/dev/venvs/redland --enable-gui=no
+make
+make install
+
+raptor2-2.0.15
+./configure --prefix=/Users/ymh/dev/venvs/redland
+make
+make install
+
+
+rasqal-0.9.32
+./configure --prefix=/Users/ymh/dev/venvs/redland
+make
+make install
+
+redland-1.0.17
+CFLAGS="-I/Users/ymh/dev/venvs/redland/include" ./configure --prefix=/Users/ymh/dev/venvs/redland
+make
+make install
+
+redland-bindings-1.0.17.1
+./configure --prefix=/Users/ymh/dev/venvs/redland --with-python=/Users/ymh/dev/venvs/redland/bin/python --with-python-ldflags="-L/Library/Frameworks/Python.framework/Versions/2.7/lib -lpython2.7 -Wl,-dylib"
+make
+make install
+
+@author: ymh
+'''
+
+# hda:Domaine -> Domain
+# 
+# hda:SousDomaine -> Domain
+# 
+# hda:Institution -> Organisation ?
+# 
+# hda:Notice -> Datasheet
+# 
+# hda:Periode -> TimePeriod
+# 
+# hda:Site -> Organisation ?
+# 
+# hda:Theme -> Domain
+# 
+# hda:Ville -> Location
+# 
+# hda:DocumentTag -> TaggedSheet
+# 
+# hda:Tag -> Tag
+# 
+# hda:Category -> TagCategory
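+#
+# A sketch of the mapping above as a dispatch table, kept commented out
+# because it is an assumption rather than wired-in code: the "?" entries are
+# unresolved and TagCategory is not imported in this module.
+#
+# HDA_CLASS_MODEL_MAP = {
+#     u"Domaine": Domain,
+#     u"SousDomaine": Domain,
+#     u"Institution": Organisation,  # marked "?" above
+#     u"Notice": Datasheet,
+#     u"Periode": TimePeriod,
+#     u"Site": Organisation,  # marked "?" above
+#     u"Theme": Domain,
+#     u"Ville": Location,
+#     u"DocumentTag": TaggedSheet,
+#     u"Tag": Tag,
+#     u"Category": TagCategory,  # not imported here
+# }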
+
+
+
+
+from django.core.management.base import BaseCommand, CommandError
+from django.db import transaction
+from hdabo.models import (Author, Datasheet, DocumentFormat, Domain, Organisation,
+    Tag, TaggedSheet, TimePeriod, Location)
+from hdabo.wp_utils import normalize_tag
+from optparse import make_option
+import csv
+import datetime
+import math
+import sys
+import os
+import shutil
+import rdflib
+import tempfile
+
+class GraphCache(object):
+    
+    def __init__(self, temp_folder=None, fmt='turtle'):
+        # None lets tempfile.mkdtemp fall back to the system temp directory
+        self.base_temp_folder = temp_folder
+        self.temp_folder = None
+        self.obj_cache = {}
+        self.format = fmt
+
+
+    def clear(self, keys=None):
+        """Clear from cache (memory)
+        
+        :param tuple keys: the key to clear. ``None`` will clear all.
+        """
+        if not keys:
+            self.obj_cache.clear()
+            return
+        if not isinstance(keys, (tuple, list)):
+            keys = (keys,)
+        current_cache_dict = self.obj_cache
+        for k in keys[:-1]:
+            current_cache_dict = current_cache_dict.get(k)
+            if not isinstance(current_cache_dict,dict):
+                raise KeyError("%r not found " % keys)
+        current_cache_dict.pop(keys[-1])
+
+    def purge(self, keys=None):
+        """Clear from disk, not from cache
+        
+        :param tuple keys: the key to clear. ``None`` will clear all.
+        """
+        if not keys:
+            path = self.temp_folder
+            self.temp_folder = None
+        else:
+            path = self.__build_path(keys)
+
+        if not path or not os.path.exists(path):
+            raise KeyError("%r not found" % keys)
+        
+        if os.path.isfile(path):
+            os.remove(path)
+        else:
+            shutil.rmtree(path)
+
+
+    def get_from_cache(self, keys):
+        """get graph from memory cache.
+        :raises: KeyError if not found
+        """
+        if not keys:
+            raise KeyError("Keys is None or empty")
+        if not isinstance(keys, (tuple,list)):
+            keys = (keys,)
+        try:
+            return reduce(lambda d,k: d[k], keys, self.obj_cache)
+        except (KeyError, TypeError):
+            raise KeyError("%r not found" % keys)
+
+    def __build_path(self, keys):
+        if not keys:
+            raise KeyError("Keys is None or empty")
+        if not isinstance(keys, (tuple,list)):
+            keys = (keys,)
+        if not self.temp_folder:
+            raise KeyError("%r not found: no disk cache folder yet" % (keys,))
+
+        return os.path.join(self.temp_folder, os.path.join(*keys))
+
+    def get_from_disk(self, keys):
+        """get graph from disk cache
+        :raises: KeyError if file not found"""
+        path = self.__build_path(keys)
+        
+        if not os.path.exists(path):
+            raise KeyError("%r not found" % keys)
+        
+        if not os.path.isfile(path):
+            raise KeyError("%r found but not a file" % keys)
+        
+        g = rdflib.Graph()
+        try:
+            g.parse(path, format=self.format)
+        except Exception as e:
+            raise KeyError("Bad key %r. Error when reading file %r" % (keys, e))
+        return g
+
+
+    def put_in_cache(self, keys, g):
+        if not keys:
+            raise KeyError("Keys is None or empty")
+        if not isinstance(keys, (tuple,list)):
+            keys = (keys,)
+
+        reduce(lambda d,k: d.setdefault(k,{}), keys[:-1], self.obj_cache)[keys[-1]] = g
+
+    
+    def put_on_disk(self, keys, g):
+        if g is None:
+            raise Exception("Null graph")
+        if not self.temp_folder or not os.path.exists(self.temp_folder):
+            self.temp_folder = tempfile.mkdtemp(dir=self.base_temp_folder)
+        if not os.path.isdir(self.temp_folder):
+            raise Exception("Temp folder for disk cache is not a dir")
+
+        if not isinstance(keys, (tuple,list)):
+            keys = (keys,)
+        if len(keys) > 1:
+            path_dir = self.__build_path(keys[:-1])
+            if not os.path.isdir(path_dir):
+                os.makedirs(path_dir)
+        path = self.__build_path(keys)
+        g.serialize(destination=path, format=self.format)
+
+    def commit_cache(self):
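+        # depth-first walk of the nested cache dict: each key path becomes a
+        # relative file path, each leaf graph is serialised via put_on_disk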
+        folder_stack = []
+        folder_stack.append(([], self.obj_cache))
+        while len(folder_stack) > 0:
+            keys, obj_dict = folder_stack.pop()
+            for k,v in obj_dict.iteritems():
+                new_keys = keys + [k]
+                if isinstance(v, dict):
+                    folder_stack.append((new_keys, v))
+                else:
+                    self.put_on_disk(new_keys, v)
+
+
+    def get(self, keys):
+        if not keys:
+            return None
+        
+        try:
+            return self.get_from_cache(keys)
+        except KeyError:
+            value = self.get_from_disk(keys)
+            self.put_in_cache(keys, value)
+            return value
+    
+    
+    def put(self, keys, value, on_disk=True):
+        self.put_in_cache(keys, value)
+        if on_disk:
+            self.put_on_disk(keys, value)
+
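+# Minimal usage sketch for GraphCache (illustrative only; `some_graph` is a
+# placeholder, and nothing in this module uses the cache yet):
+#
+#   cache = GraphCache(fmt='turtle')
+#   cache.put(('notices', 'n1'), some_graph, on_disk=False)
+#   cache.commit_cache()               # flush every in-memory graph to disk
+#   cache.clear()                      # drop the memory cache...
+#   g = cache.get(('notices', 'n1'))   # ...and reload from disk on demand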
+
+
+RDF_IMPORTERS = {}
+
+class RdfImporter(object):
+    def __init__(self, g):
+        self.graph = g
+    def import_graph(self):
+        raise NotImplementedError() 
+
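+
+# Hypothetical example of how RDF_IMPORTERS could be filled in: one
+# RdfImporter subclass per RDF class, keyed by the class local name. The key
+# scheme is an assumption; handle() below does not dispatch to it yet.
+class NoticeImporter(RdfImporter):
+    def import_graph(self):
+        # a real importer would walk self.graph and create Datasheet rows
+        pass
+
+RDF_IMPORTERS[u"Notice"] = NoticeImporter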
+
+class Command(BaseCommand):
+    '''
+    Command to import RDF files
+    '''
+    args = '<path_to_rdf_file path_to_rdf_file ...>'
+    options = '[--ignore-existing] [--lines]'
+    help = """Import RDF files for hdabo
+Options:
+    --ignore-existing : ignore existing datasheets
+    --lines : max number of lines to load (for each file). 0 means all.
+"""
+    
+    option_list = BaseCommand.option_list + (
+        make_option('--lines',
+            action='store',
+            type='int',
+            dest='lines',
+            default=0,
+            help='Number of lines to read. 0 means all.'),
+        make_option('--ignore-existing',
+            action='store_true',
+            dest='ignore_existing',
+            default=False,
+            help='skip datasheets that already exist'),
+        
+        )
+    
+    def show_progress(self, current_line, total_line, width):
+
+        percent = (float(current_line) / float(total_line)) * 100.0
+
+        marks = math.floor(width * (percent / 100.0))
+        spaces = math.floor(width - marks)
+    
+        loader = '[' + ('=' * int(marks)) + (' ' * int(spaces)) + ']'
+    
+        sys.stdout.write("%s %d%% %d/%d\r" % (loader, percent, current_line - 1, total_line - 1)) #takes the header into account
+        if percent >= 100:
+            sys.stdout.write("\n")
+        sys.stdout.flush()
+
+    
+    def create_domain_period(self, row_value, klass, school_period):
+        res_list = []
+        if not row_value:
+            return res_list
+        # labels are vertical-tab (\x0b) separated in the source export
+        for label_str in [dstr.strip() for dstr in row_value.split('\x0b')]:
+            if label_str:
+                res_obj, created = klass.objects.get_or_create(label=label_str, school_period=school_period) #@UnusedVariable
+                res_list.append(res_obj)
+        return res_list
+    
+    def create_datasheet(self, row):
+        
+        if self.ignore_existing and Datasheet.objects.filter(hda_id=row[u"ID"]).exists():
+            return
+        
+        author_str = row[u'Auteur']
+        if author_str:
+            author_array = author_str.split(" ")
+            if len(author_array) == 1:
+                firstname = ""
+                lastname = author_array[0]
+            else:
+                # more than two tokens used to leave firstname/lastname
+                # unbound; treat everything after the first token as lastname
+                firstname = author_array[0]
+                lastname = " ".join(author_array[1:])
+
+            author, created = Author.objects.get_or_create(hda_id=author_str, defaults={"firstname":firstname, "lastname":lastname}) #@UnusedVariable
+        else:
+            author = None
+        
+        org_str = row[u"Org"]    
+        if org_str:
+            url_str = row[u'Org_Home']
+            if url_str is not None:
+                url_str = url_str.strip()
+            org, created = Organisation.objects.get_or_create(hda_id=org_str, defaults={"name":org_str, "website" : url_str}) #@UnusedVariable
+        else:
+            org = None
+            
+        town_str = row[u"Ville"]
+        if town_str:
+            insee_str = row[u'Insee'].strip() if row[u'Insee'] else u""
+            if len(insee_str) > 5:
+                insee_str = u""
+            loc, created = Location.objects.get_or_create(insee=insee_str, defaults={"name": town_str, "insee": insee_str}) #@UnusedVariable
+        else:
+            loc = None
+            
+        format_str = row[u"Format"]
+        if format_str:
+            format, created = DocumentFormat.objects.get_or_create(label=format_str, defaults={"label": format_str}) #@UnusedVariable
+        else:
+            format = None
+        
+        domains = self.create_domain_period(row[u"Domaine"], Domain, Domain.DOMAIN_PERIOD_DICT[u'Global'])
+                                        
+        primary_periods = self.create_domain_period(row[u"Periode1"], TimePeriod, TimePeriod.TIME_PERIOD_DICT[u'Primaire'])
+        college_periods = self.create_domain_period(row[u"Periode2"], TimePeriod, TimePeriod.TIME_PERIOD_DICT[u'Collège'])
+        highschool_periods = self.create_domain_period(row[u"Periode3"], TimePeriod, TimePeriod.TIME_PERIOD_DICT[u'Lycée'])
+                    
+        primary_themes = self.create_domain_period(row[u"Sousdom"], Domain, Domain.DOMAIN_PERIOD_DICT[u'Primaire'])
+        college_themes = self.create_domain_period(row[u"Theme2"], Domain, Domain.DOMAIN_PERIOD_DICT[u'Collège'])
+        highschool_themes = self.create_domain_period(row[u"Theme3"], Domain, Domain.DOMAIN_PERIOD_DICT[u'Lycée'])
+        
+        url = row[u"Url"]
+        if url is not None:
+            url = url.strip()
+        
+        datasheet = Datasheet.objects.create(
+            hda_id=row[u"ID"],
+            author=author,
+            organisation=org,
+            title=row[u"Titre"],
+            description=row[u"Desc"],
+            url=url,
+            town=loc,
+            format=format,
+            original_creation_date=datetime.datetime.strptime(row[u"Datcre"], "%d/%m/%Y").date(),
+            original_modification_date=datetime.datetime.strptime(row[u"Datmaj"], "%d/%m/%Y").date(),
+            validated=False
+        )
+
+        datasheet.set_domains(domains)
+        datasheet.set_primary_periods(primary_periods)
+        datasheet.set_college_periods(college_periods)
+        datasheet.set_highschool_periods(highschool_periods)
+        datasheet.set_primary_themes(primary_themes)
+        datasheet.set_college_themes(college_themes)
+        datasheet.set_highschool_themes(highschool_themes)
+
+        
+        if row[u'Tag']:
+            for i, tag in enumerate([t.strip() for t in row[u'Tag'].split(u";")]):
+                if len(tag) == 0:
+                    continue
+                tag_label = normalize_tag(tag)
+                tag_obj = None
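+                # prefer an existing tag (case-insensitive label match) whose
+                # url_status is not 'null_result'; otherwise create a new Tag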
+                for t in Tag.objects.filter(label__iexact=tag_label):
+                    if tag_obj is None or t.url_status != Tag.TAG_URL_STATUS_DICT['null_result']:
+                        tag_obj = t
+                        if tag_obj.url_status != Tag.TAG_URL_STATUS_DICT['null_result']:
+                            break
+ 
+                if tag_obj is None:
+                    tag_obj = Tag(label=tag_label, original_label=tag)
+                    tag_obj.save()
+
+                tagged_ds = TaggedSheet(datasheet=datasheet, tag=tag_obj, original_order=i + 1, order=i + 1)
+                tagged_ds.save()
+        
+
+    def handle(self, *args, **options):
+        
+        if len(args) == 0:
+            raise CommandError("Give at least one RDF file to import")
+        
+        lines = options.get('lines', 0)
+        self.ignore_existing = options.get('ignore_existing', False)
+        
+        # open the rdf files: parse each argument into an rdflib graph so the
+        # loops below iterate over real graphs (the rest of the pipeline is
+        # still a stub)
+        graph_iterator = []
+        for rdf_path in args:
+            g = rdflib.Graph()
+            g.parse(rdf_path)
+            graph_iterator.append(g)
+
+        #getting sizes and splitting rdf in "cache"
+
+        #iterate on 1 level objects, counting by type
+        for rdfgraph in graph_iterator:
+            pass
+        #iterate on 1 level objects
+        for rdfgraph in graph_iterator:
+            #rdf_importer = RDF_IMPORTERS.get(k)
+            pass
+        
+