better migration for dbpedia uri
author cavaliet
Fri, 31 Jan 2014 16:08:50 +0100
changeset 117 5771052a647a
parent 116 a023e0185a02
child 118 e9c1a6f4b83e
better migration for dbpedia uri
DoctrineMigrations/Version20140129151724.php
README.md
Utils/WikiTagUtils.php
--- a/DoctrineMigrations/Version20140129151724.php	Thu Jan 30 17:52:14 2014 +0100
+++ b/DoctrineMigrations/Version20140129151724.php	Fri Jan 31 16:08:50 2014 +0100
@@ -20,17 +20,26 @@
         // this up() migration is autogenerated, please modify it to your needs
         $this->abortIf($this->connection->getDatabasePlatform()->getName() != "mysql");
         
+        
         // First we get all tags.
         $em = $GLOBALS["kernel"]->getContainer()->get( 'doctrine.orm.entity_manager' );
-        $query = $em->createQuery('SELECT t FROM WikiTagBundle:Tag t ORDER BY t.label ASC');//->setMaxResults(40)->setFirstResult(5000);
+        // Avoid php annoying memory leaks
+        $em->getConnection()->getConfiguration()->setSQLLogger(null);
+        
+        // First step : we populate the dbpedia uris thanks to the dbpedia-owl:wikiPageID
+        echo "\nFIRST STEP";
+        $query = $em->createQuery('SELECT t FROM WikiTagBundle:Tag t ORDER BY t.label ASC');//->setMaxResults(240)->setFirstResult(500);
         $tags = $query->getResult();
-        
         $i = 1;
+        $nb_set = 0;
         foreach($tags as $tag){
         	$l = $tag->getLabel();
-        	$uri = WikiTagUtils::getDbpediaUri($tag->getLabel(), [], false);
+        	$uri = WikiTagUtils::getDbpediaUri($tag->getWikipediaPageId(), [], false, "pageid");
         	$tag->setDbpediaUri($uri);
         	$em->persist($tag);
+         if($uri!=NULL && $uri!=""){
+             $nb_set++;
+         }
         	if( $i % 50 == 0 ){
         		$em->flush();
         		echo "\n    FLUSH";
@@ -39,6 +48,57 @@
         	echo "\n$i : $l \t\t: $uri";
         }
         $em->flush();
+        echo "\nFIRST STEP : $nb_set uris found";
+        
+        
+        // Second step : we populate the dbpedia uris not found thanks to the foaf:isPrimaryTopicOf
+        echo "\nSECOND STEP";
+        $query = $em->createQuery('SELECT t FROM WikiTagBundle:Tag t WHERE (t.dbpediaUri=\'\' OR t.dbpediaUri IS NULL) ORDER BY t.label ASC');//->setMaxResults(240);
+        $tags = $query->getResult();
+        $i = 1;
+        $nb_set = 0;
+        foreach($tags as $tag){
+        	$l = $tag->getLabel();
+        	$uri = WikiTagUtils::getDbpediaUri($tag->getWikipediaUrl(), [], false, "wikiurl");
+        	$tag->setDbpediaUri($uri);
+        	$em->persist($tag);
+            if($uri!=NULL && $uri!=""){
+                $nb_set++;
+            }
+        	if( $i % 50 == 0 ){
+        		$em->flush();
+        		echo "\n    FLUSH";
+        	}
+        	$i++;
+        	echo "\n$i : $l \t\t: $uri";
+        }
+        $em->flush();
+        echo "\nSECOND STEP : $nb_set uris found";
+        
+        
+        // Third step : we populate the dbpedia uris not found thanks to the rdfs:label
+        echo "\nTHIRD STEP";
+        $query = $em->createQuery('SELECT t FROM WikiTagBundle:Tag t WHERE (t.dbpediaUri=\'\' OR t.dbpediaUri IS NULL) ORDER BY t.label ASC');//->setMaxResults(240);
+        $tags = $query->getResult();
+        $i = 1;
+        $nb_set = 0;
+        foreach($tags as $tag){
+        	$l = $tag->getLabel();
+        	$uri = WikiTagUtils::getDbpediaUri($tag->getLabel(), [], false);
+        	$tag->setDbpediaUri($uri);
+        	$em->persist($tag);
+            if($uri!=NULL && $uri!=""){
+                $nb_set++;
+            }
+        	if( $i % 50 == 0 ){
+        		$em->flush();
+        		echo "\n    FLUSH";
+        	}
+        	$i++;
+        	echo "\n$i : $l \t\t: $uri";
+        }
+        $em->flush();
+        echo "\nTHIRD STEP : $nb_set uris found";
     }
 
     public function down(Schema $schema)
--- a/README.md	Thu Jan 30 17:52:14 2014 +0100
+++ b/README.md	Fri Jan 31 16:08:50 2014 +0100
@@ -332,4 +332,12 @@
   +  last doctags: php app/console wikitag:load-fixtures -B 40001 /path/to/data.json
 
 The -B (index Begin) and -E (index End) options also work on the tags. Therefore you can also import tags in slices.
+
+## Migration
+
+The wikitag folder contains a migration in DoctrineMigrations/Version20140129151724.php. If your wikitag version is older than V00.14, you need to run this migration.
+This migration takes every tag label and looks up the real dbpedia uri associated with this label.
+Previously, the dbpedia uri was built manually as http://dbpedia.org/resource/ + english_label.
+Now we get the dbpedia uri by querying http://LANG_CODE.dbpedia.org/sparql with the current label.
+
  
--- a/Utils/WikiTagUtils.php	Thu Jan 30 17:52:14 2014 +0100
+++ b/Utils/WikiTagUtils.php	Fri Jan 31 16:08:50 2014 +0100
@@ -281,16 +281,25 @@
     /**
      * Builds DbPedia URI
      */
-    public static function getDbpediaUri($label, $params=[], $throw_error=true)
+    public static function getDbpediaUri($label, $params=[], $throw_error=true, $req_param="label")
     {
     	// Get lang from url
     	$dbp_url = $GLOBALS["kernel"]->getContainer()->getParameter("wiki_tag.url_templates")["dbpedia_sparql"];
     	$lang = substr($dbp_url, 7, 2);
+    	$query = 'select distinct * where { ?s rdfs:label "'.$label.'"@'.$lang.' . FILTER (regex(?s, "^http\\\\://[^:]+$")) }';
+    	if($req_param=="pageid"){
+    		$query = 'select distinct * where { ?s dbpedia-owl:wikiPageID '.$label.' }';
+    	}
+    	elseif ($req_param=="wikiurl"){
+    		$query = 'select distinct * where { ?s foaf:isPrimaryTopicOf <'.$label.'> }';
+    	}
+    	
+    	
     	// filter with regexp to avoid results with "category:LABEL" or other "abc:LABEL"
     	//"query" => 'select distinct * where { ?s rdfs:label "'.$label.'"@'.$lang.' }',
     	//"query" => 'select distinct * where { ?s rdfs:label "'.$label.'"@'.$lang.' . FILTER (regex(?s, "^http\\\\://[^:]+$")) }',
     	$params = [
-    		"query" => 'select distinct * where { ?s rdfs:label "'.$label.'"@'.$lang.' . FILTER (regex(?s, "^http\\\\://[^:]+$")) }',
+    		"query" => $query,
     		"format" => 'application/json',
     	];