--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/hdabo/management/commands/import_rdf.py Wed Dec 10 10:05:38 2014 +0100
@@ -0,0 +1,455 @@
+# -*- coding: utf-8 -*-
+'''
+Created on May 25, 2011
+
+
+Compilation notes (building the Redland RDF stack into a virtualenv):
+
+# Install librdf
+# install raptor2 : configure --prefix=<path to venv> + make + make install
+# install librasqal : PKG_CONFIG_PATH=/Users/ymh/dev/venvs/hdalab/lib/pkgconfig RAPTOR2_LIBS="-L/Users/ymh/dev/venvs/hdalab/lib -lraptor2" RAPTOR2_CFLAGS="-I/Users/ymh/dev/venvs/hdalab/include/raptor2" RAPTOR_VERSION=2.0.15 ./configure --prefix=/Users/ymh/dev/venvs/hdalab
+
+
+raptor2-2.0.15:
+./configure --prefix=/Users/ymh/dev/venvs/hdalab
+
+rasqal-0.9.32:
+
+diff --git a/rasqal-0.9.32/configure b/rasqal-0.9.32.new/configure
+index a29a606..b1dda08 100755
+--- a/rasqal-0.9.32/configure
++++ b/rasqal-0.9.32.new/configure
+@@ -14105,7 +14105,7 @@ else
+ { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+ $as_echo "yes" >&6; }
+
+- RAPTOR_VERSION=`$PKG_CONFIG raptor2 --modversion 2>/dev/null`
++ RAPTOR_VERSION="2.0.15" #`$PKG_CONFIG raptor2 --modversion 2>/dev/null`
+ raptor_too_old=0
+ as_arg_v1=$RAPTOR_VERSION
+ as_arg_v2=$RAPTOR_MIN_VERSION
+
+RAPTOR2_LIBS="-L/Users/ymh/dev/venvs/hdalab/lib -lraptor2" RAPTOR2_CFLAGS="-I/Users/ymh/dev/venvs/hdalab/include/raptor2" RAPTOR_VERSION=2.0.15 ./configure --prefix=/Users/ymh/dev/venvs/hdalab
+
+sqlite:
+./configure --prefix=/Users/ymh/dev/venvs/hdalab
+
+unixODBC:
+./configure --prefix=/Users/ymh/dev/venvs/hdalab --enable-gui=no
+make
+make install
+
+redland-1.0.17:
+CFLAGS="-I/Users/ymh/dev/venvs/hdalab/include" PKG_CONFIG_PATH=/Users/ymh/dev/venvs/hdalab/lib/pkgconfig RAPTOR2_LIBS="-L/Users/ymh/dev/venvs/hdalab/lib -lraptor2" RAPTOR2_CFLAGS="-I/Users/ymh/dev/venvs/hdalab/include/raptor2" RAPTOR_VERSION=2.0.15 RASQAL_LIBS="-L/Users/ymh/dev/venvs/hdalab/lib -lrasqal" RASQAL_CFLAGS="-I/Users/ymh/dev/venvs/hdalab/include/rasqal" ./configure --prefix=/Users/ymh/dev/venvs/hdalab
+
+redland-bindings:
+CFLAGS="-L/Users/ymh/dev/venvs/hdalab/lib -I/Users/ymh/dev/venvs/hdalab/include -I/Users/ymh/dev/venvs/hdalab/include/raptor2 -I/Users/ymh/dev/venvs/hdalab/include/rasqal -lraptor2 -lrasqal -lrdf" PKG_CONFIG_PATH=/Users/ymh/dev/venvs/hdalab/lib/pkgconfig RAPTOR2_LIBS="-L/Users/ymh/dev/venvs/hdalab/lib -lraptor2" RAPTOR2_CFLAGS="-I/Users/ymh/dev/venvs/hdalab/include/raptor2" RAPTOR_VERSION=2.0.15 RASQAL_LIBS="-L/Users/ymh/dev/venvs/hdalab/lib -lrasqal" RASQAL_CFLAGS="-I/Users/ymh/dev/venvs/hdalab/include/rasqal" ./configure --prefix=/Users/ymh/dev/venvs/hdalab --with-python=/Users/ymh/dev/venvs/hdalab/bin/python --with-python-ldflags="-L/Users/ymh/dev/venvs/hdalab/lib -L/System/Library/Frameworks/Python.framework/Versions/2.7/lib -lpython2.7 -v -Wl,-dylib" --with-python-libext=.so
+
+
+---
+Prerequisites:
+- pkg-config:
+./configure --prefix=/Users/ymh/dev/venvs/redland --with-internal-glib && make && make install
+
+- sqlite:
+./configure --prefix=/Users/ymh/dev/venvs/redland && make && make install
+
+- unixODBC:
+./configure --prefix=/Users/ymh/dev/venvs/redland --enable-gui=no
+make
+make install
+
+raptor2-2.0.15:
+./configure --prefix=/Users/ymh/dev/venvs/hdalab
+make
+make install
+
+
+rasqal-0.9.32:
+./configure --prefix=/Users/ymh/dev/venvs/hdalab
+make
+make install
+
+redland-1.0.17:
+CFLAGS="-I/Users/ymh/dev/venvs/redland/include" ./configure --prefix=/Users/ymh/dev/venvs/redland
+make
+make install
+
+redland-bindings-1.0.17.1:
+./configure --prefix=/Users/ymh/dev/venvs/redland --with-python=/Users/ymh/dev/venvs/redland/bin/python --with-python-ldflags="-L/Library/Frameworks/Python.framework/Versions/2.7/lib -lpython2.7 -Wl,-dylib"
+make
+make install
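+
+After building, a quick sanity check from the target virtualenv ("RDF" is the
+module name installed by redland-bindings):
+
+import RDF
+print RDF.__file__   # should point inside the venv's site-packages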
+
+@author: ymh
+'''
+
+# hda:Domaine -> Domain
+#
+# hda:SousDomaine -> Domain
+#
+# hda:Institution -> Organisation ?
+#
+# hda:Notice -> DataSheet
+#
+# hda:Periode -> TimePeriod
+#
+# hda:Site -> Organisation ?
+#
+# hda:Theme -> Domain
+#
+# hda:Ville -> Location
+#
+# hda:DocumentTag -> TaggedSheet
+#
+# hda:Tag -> Tag
+#
+# hda:Category -> TagCategory
+
+
+
+
+from django.core.management.base import BaseCommand, CommandError
+from django.db import transaction
+from hdabo.models import (Author, Datasheet, DocumentFormat, Domain, Organisation,
+ Tag, TaggedSheet, TimePeriod, Location)
+from hdabo.wp_utils import normalize_tag
+from optparse import make_option
+import csv
+import datetime
+import math
+import sys
+import os
+import shutil
+import rdflib
+import tempfile
+
+class GraphCache(object):
+
+ def __init__(self, temp_folder=tempfile.tempdir, fmt='turtle'):
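+        # ``tempfile.tempdir`` is usually None, in which case mkdtemp() in
+        # put_on_disk() falls back to the platform default temp directory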
+ self.base_temp_folder = temp_folder
+ self.temp_folder = None
+ self.obj_cache = {}
+ self.format = fmt
+
+
+ def clear(self, keys=None):
+ """Clear from cache (memory)
+
+ :param tuple keys: the key to clear. ``None`` will clear all.
+ """
+ if not keys:
+ self.obj_cache.clear()
+ return
+        if not isinstance(keys, (tuple, list)):
+            keys = (keys,)
+        current_cache_dict = self.obj_cache
+        for k in keys[:-1]:
+            current_cache_dict = current_cache_dict.get(k)
+            if not isinstance(current_cache_dict, dict):
+                raise KeyError("%r not found" % keys)
+        current_cache_dict.pop(keys[-1])
+
+ def purge(self, keys=None):
+ """Clear from disk, not from cache
+
+ :param tuple keys: the key to clear. ``None`` will clear all.
+ """
+        if not keys:
+            path = self.temp_folder
+            self.temp_folder = None
+            if path is None:
+                raise KeyError("Nothing cached on disk yet")
+        else:
+            path = self.__build_path(keys)
+
+ if not os.path.exists(path):
+ raise KeyError("%r not found" % keys)
+
+ if os.path.isfile(path):
+ os.remove(path)
+ else:
+ shutil.rmtree(path)
+
+
+ def get_from_cache(self, keys):
+ """get graph from memory cache.
+ :raises: KeyError if not found
+ """
+ if not keys:
+ raise KeyError("Keys is None or empty")
+ if not isinstance(keys, (tuple,list)):
+ keys = (keys,)
+        try:
+            # walk the nested dicts following keys
+            return reduce(lambda d, k: d[k], keys, self.obj_cache)
+        except (KeyError, TypeError):
+            raise KeyError("%r not found" % keys)
+
+ def __build_path(self, keys):
+ if not keys:
+ raise KeyError("Keys is None or empty")
+ if not isinstance(keys, (tuple,list)):
+ keys = (keys,)
+
+        if not self.temp_folder:
+            raise KeyError("%r not found (no disk cache yet)" % (keys,))
+        return os.path.join(self.temp_folder, os.path.join(*keys))
+
+ def get_from_disk(self, keys):
+ """get graph from disk cache
+ :raises: KeyError if file not found"""
+ path = self.__build_path(keys)
+
+ if not os.path.exists(path):
+ raise KeyError("%r not found" % keys)
+
+ if not os.path.isfile(path):
+ raise KeyError("%r found but not a file" % keys)
+
+ g = rdflib.Graph()
+ try:
+ g.parse(path, format=self.format)
+ except Exception as e:
+ raise KeyError("Bad key %r. Error when reading file %r" % (keys, e))
+ return g
+
+
+ def put_in_cache(self, keys, g):
+ if not keys:
+ raise KeyError("Keys is None or empty")
+ if not isinstance(keys, (tuple,list)):
+ keys = (keys,)
+
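+        # walk (and create on demand) the nested dicts for keys[:-1], then
+        # store the graph under the final key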
+ reduce(lambda d,k: d.setdefault(k,{}), keys[:-1], self.obj_cache)[keys[-1]] = g
+
+
+    def put_on_disk(self, keys, g):
+        if g is None:
+            raise Exception("Null graph")
+        if not isinstance(keys, (tuple, list)):
+            keys = (keys,)
+        if not self.temp_folder or not os.path.exists(self.temp_folder):
+            self.temp_folder = tempfile.mkdtemp(dir=self.base_temp_folder)
+        if not os.path.isdir(self.temp_folder):
+            raise Exception("Temp folder for disk cache is not a dir")
+
+        if len(keys) > 1:
+            path_dir = self.__build_path(keys[:-1])
+            if not os.path.isdir(path_dir):
+                os.makedirs(path_dir)
+        path = self.__build_path(keys)
+        # rdflib's method is "serialize"; write in the same format we parse with
+        g.serialize(destination=path, format=self.format)
+
+ def commit_cache(self):
+ folder_stack = []
+ folder_stack.append(([], self.obj_cache))
+ while len(folder_stack) > 0:
+ keys, obj_dict = folder_stack.pop()
+ for k,v in obj_dict.iteritems():
+ new_keys = keys + [k]
+ if isinstance(v, dict):
+ folder_stack.append((new_keys, v))
+ else:
+ self.put_on_disk(new_keys, v)
+
+
+ def get(self, keys):
+ if not keys:
+ return None
+
+ try:
+ return self.get_from_cache(keys)
+ except KeyError:
+ value = self.get_from_disk(keys)
+ self.put_in_cache(keys, value)
+ return value
+
+
+ def put(self, keys, value, on_disk=True):
+ self.put_in_cache(keys, value)
+ if on_disk:
+ self.put_on_disk(keys, value)
+
+
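+# Illustrative GraphCache usage (keys index both the in-memory dict and the
+# on-disk folder hierarchy):
+#
+#   cache = GraphCache(fmt='turtle')
+#   g = rdflib.Graph()
+#   cache.put(('notices', 'notice-1'), g, on_disk=False)  # memory only
+#   same = cache.get(('notices', 'notice-1'))             # memory first, then disk
+#   cache.commit_cache()                                  # flush everything to disk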
+
+RDF_IMPORTERS = {}
+
+class RdfImporter(object):
+    def __init__(self, g):
+        self.graph = g
+
+    def import_graph(self):
+        raise NotImplementedError()
+
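+# Hypothetical sketch of a concrete importer; the registry key and the class
+# are illustrative only (see the hda:* -> model mapping above):
+#
+# class TagImporter(RdfImporter):
+#     def import_graph(self):
+#         for s, p, o in self.graph:
+#             pass  # create Tag rows from the triples
+#
+# RDF_IMPORTERS['hda:Tag'] = TagImporter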
+
+class Command(BaseCommand):
+    '''
+    Command to import RDF files
+    '''
+    args = '<path_to_rdf_file path_to_rdf_file ...>'
+    options = '[--ignore-existing] [--lines]'
+    help = """Import RDF files for hdabo
+Options:
+    --ignore-existing : ignore existing datasheets
+    --lines : max number of lines to load (for each file). 0 means all.
+"""
+
+ option_list = BaseCommand.option_list + (
+ make_option('--lines',
+ action='store',
+ type='int',
+ dest='lines',
+ default=0,
+ help='Number of lines to read. 0 means all.'),
+ make_option('--ignore-existing',
+ action='store_true',
+ dest='ignore_existing',
+ default=False,
+ help='force insertion'),
+
+ )
+
+ def show_progress(self, current_line, total_line, width):
+
+ percent = (float(current_line) / float(total_line)) * 100.0
+
+ marks = math.floor(width * (percent / 100.0))
+ spaces = math.floor(width - marks)
+
+ loader = '[' + ('=' * int(marks)) + (' ' * int(spaces)) + ']'
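+        # renders e.g. "[=====     ] 52% 10/20"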
+
+ sys.stdout.write("%s %d%% %d/%d\r" % (loader, percent, current_line - 1, total_line - 1)) #takes the header into account
+ if percent >= 100:
+ sys.stdout.write("\n")
+ sys.stdout.flush()
+
+
+ def create_domain_period(self, row_value, klass, school_period):
+ res_list = []
+ if not row_value:
+ return res_list
+        # multiple values arrive in a single cell separated by vertical tabs (\x0b)
+        for label_str in [dstr.strip() for dstr in row_value.split('\x0b')]:
+            if label_str:
+                res_obj, created = klass.objects.get_or_create(label=label_str, school_period=school_period) #@UnusedVariable
+ res_list.append(res_obj)
+ return res_list
+
+ def create_datasheet(self, row):
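+        """Create a Datasheet (and its related rows) from one source record.
+
+        ``row`` is a mapping keyed by the source column names used below:
+        ID, Auteur, Org, Org_Home, Ville, Insee, Format, Domaine,
+        Periode1/2/3, Sousdom, Theme2/3, Url, Titre, Desc, Datcre,
+        Datmaj, Tag.
+        """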
+
+ if self.ignore_existing and Datasheet.objects.filter(hda_id=row[u"ID"]).count() > 0:
+ return
+
+        author_str = row[u'Auteur']
+        if author_str:
+            # "Firstname Lastname"; a single token is treated as a lastname,
+            # everything after the first token goes into the lastname
+            author_array = author_str.split(" ")
+            if len(author_array) == 1:
+                firstname = ""
+                lastname = author_array[0]
+            else:
+                firstname = author_array[0]
+                lastname = " ".join(author_array[1:])
+
+            author, created = Author.objects.get_or_create(hda_id=author_str, defaults={"firstname":firstname, "lastname":lastname}) #@UnusedVariable
+        else:
+            author = None
+
+ org_str = row[u"Org"]
+ if org_str:
+ url_str = row[u'Org_Home']
+ if url_str is not None:
+ url_str = url_str.strip()
+ org, created = Organisation.objects.get_or_create(hda_id=org_str, defaults={"name":org_str, "website" : url_str}) #@UnusedVariable
+ else:
+ org = None
+
+        town_str = row[u"Ville"]
+        if town_str:
+            insee_str = (row[u'Insee'] or u"").strip()
+            if len(insee_str) > 5:  # a valid INSEE commune code is 5 characters
+                insee_str = ""
+            loc, created = Location.objects.get_or_create(insee=insee_str, defaults={"name": town_str}) #@UnusedVariable
+        else:
+            loc = None
+
+        format_str = row[u"Format"]
+        if format_str:
+            # "doc_format" avoids shadowing the builtin format()
+            doc_format, created = DocumentFormat.objects.get_or_create(label=format_str) #@UnusedVariable
+        else:
+            doc_format = None
+
+ domains = self.create_domain_period(row[u"Domaine"], Domain, Domain.DOMAIN_PERIOD_DICT[u'Global'])
+
+ primary_periods = self.create_domain_period(row[u"Periode1"], TimePeriod, TimePeriod.TIME_PERIOD_DICT[u'Primaire'])
+ college_periods = self.create_domain_period(row[u"Periode2"], TimePeriod, TimePeriod.TIME_PERIOD_DICT[u'Collège'])
+ highschool_periods = self.create_domain_period(row[u"Periode3"], TimePeriod, TimePeriod.TIME_PERIOD_DICT[u'Lycée'])
+
+ primary_themes = self.create_domain_period(row[u"Sousdom"], Domain, Domain.DOMAIN_PERIOD_DICT[u'Primaire'])
+ college_themes = self.create_domain_period(row[u"Theme2"], Domain, Domain.DOMAIN_PERIOD_DICT[u'Collège'])
+ highschool_themes = self.create_domain_period(row[u"Theme3"], Domain, Domain.DOMAIN_PERIOD_DICT[u'Lycée'])
+
+ url = row[u"Url"]
+ if url is not None:
+ url = url.strip()
+
+        # objects.create() saves the new row; no separate save() call is needed
+        datasheet = Datasheet.objects.create(
+            hda_id=row[u"ID"],
+            author=author,
+            organisation=org,
+            title=row[u"Titre"],
+            description=row[u"Desc"],
+            url=url,
+            town=loc,
+            format=doc_format,
+            original_creation_date=datetime.datetime.strptime(row[u"Datcre"], "%d/%m/%Y").date(),
+            original_modification_date=datetime.datetime.strptime(row[u"Datmaj"], "%d/%m/%Y").date(),
+            validated=False
+        )
+
+ datasheet.set_domains(domains)
+ datasheet.set_primary_periods(primary_periods)
+ datasheet.set_college_periods(college_periods)
+ datasheet.set_highschool_periods(highschool_periods)
+ datasheet.set_primary_themes(primary_themes)
+ datasheet.set_college_themes(college_themes)
+ datasheet.set_highschool_themes(highschool_themes)
+
+
+ if row[u'Tag']:
+ for i, tag in enumerate([t.strip() for t in row[u'Tag'].split(u";")]):
+ if len(tag) == 0:
+ continue
+ tag_label = normalize_tag(tag)
+ tag_obj = None
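+                # prefer an existing tag whose url_status is not "null_result";
+                # otherwise fall back to any case-insensitive label match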
+ for t in Tag.objects.filter(label__iexact=tag_label):
+ if tag_obj is None or t.url_status != Tag.TAG_URL_STATUS_DICT['null_result']:
+ tag_obj = t
+ if tag_obj.url_status != Tag.TAG_URL_STATUS_DICT['null_result']:
+ break
+
+ if tag_obj is None:
+ tag_obj = Tag(label=tag_label, original_label=tag)
+ tag_obj.save()
+
+ tagged_ds = TaggedSheet(datasheet=datasheet, tag=tag_obj, original_order=i + 1, order=i + 1)
+ tagged_ds.save()
+
+
+ def handle(self, *args, **options):
+
+ if len(args) == 0:
+            raise CommandError("Give at least one RDF file to import")
+
+ lines = options.get('lines', 0)
+ self.ignore_existing = options.get('ignore_existing', False)
+
+        # open rdf files -- still a stub: no graphs are produced yet
+        graph_iterator = []
+
+        # getting sizes and splitting the rdf into the "cache"
+
+        # iterate over top-level objects, counting by type
+        for rdfgraph in graph_iterator:
+            pass
+        # iterate over top-level objects and dispatch to the matching importer
+        for rdfgraph in graph_iterator:
+            #rdf_importer = RDF_IMPORTERS.get(k)
+            pass
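+
+        # Hypothetical sketch of how graph_iterator could be fed from the
+        # GraphCache above (key scheme and parse format are assumptions):
+        #
+        # cache = GraphCache(fmt='turtle')
+        # for path in args:
+        #     g = rdflib.Graph()
+        #     g.parse(path, format='turtle')
+        #     cache.put((os.path.basename(path),), g, on_disk=False)
+        # cache.commit_cache()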
+
+