migration to real dbpedia uri
author cavaliet
Thu, 30 Jan 2014 17:52:14 +0100
changeset 116 a023e0185a02
parent 115 085ea4dbfeee
child 117 5771052a647a
migration to real dbpedia uri
DoctrineMigrations/Version20140129151724.php
Utils/WikiTagUtils.php
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/DoctrineMigrations/Version20140129151724.php	Thu Jan 30 17:52:14 2014 +0100
@@ -0,0 +1,49 @@
+<?php
+
+namespace Application\Migrations;
+
+use Doctrine\DBAL\Migrations\AbstractMigration,
+    Doctrine\DBAL\Schema\Schema,
+	IRI\Bundle\WikiTagBundle\Utils\WikiTagUtils;
+
+/**
+ * Migration for WikiTagBundle <= V00.13
+ * This migration takes every tag label and searches for the real DBpedia URI associated with that label.
+ * Previously, the DBpedia URI was built manually as http://dbpedia.org/resource/ + english_label.
+ * Now we get the DBpedia URI by querying http://LANG_CODE.dbpedia.org/sparql with the current label.
+ * The change is one-way: down() only checks the database platform and does not restore the previous URIs.
+ */
+class Version20140129151724 extends AbstractMigration
+{
+    public function up(Schema $schema)
+    {
+        // This data migration only supports MySQL: abort on any other database platform.
+        $this->abortIf($this->connection->getDatabasePlatform()->getName() != "mysql");
+        
+        // Load every tag, ordered by label, using the entity manager taken from the global kernel container.
+        $em = $GLOBALS["kernel"]->getContainer()->get( 'doctrine.orm.entity_manager' );
+        $query = $em->createQuery('SELECT t FROM WikiTagBundle:Tag t ORDER BY t.label ASC');//->setMaxResults(40)->setFirstResult(5000);
+        $tags = $query->getResult();
+        
+        $i = 1;
+        foreach($tags as $tag){
+        	$l = $tag->getLabel();
+        	$uri = WikiTagUtils::getDbpediaUri($tag->getLabel(), [], false);
+        	$tag->setDbpediaUri($uri);
+        	$em->persist($tag);
+        	if( $i % 50 == 0 ){
+        		$em->flush();
+        		echo "\n    FLUSH";
+        	}
+        	$i++;
+        	echo "\n$i : $l \t\t: $uri";
+        }
+        $em->flush();
+    }
+
+    public function down(Schema $schema)
+    {
+        // Nothing is rolled back here: only the MySQL platform check is performed.
+        $this->abortIf($this->connection->getDatabasePlatform()->getName() != "mysql");
+    }
+}
--- a/Utils/WikiTagUtils.php	Wed Jan 29 12:16:16 2014 +0100
+++ b/Utils/WikiTagUtils.php	Thu Jan 30 17:52:14 2014 +0100
@@ -190,7 +190,7 @@
      * @param string $url
      * @return object (json decoded)
      */
-    private static function curlRequest($url)
+    private static function curlRequest($url, $throw_error=true)
     {
     	$ch = curl_init();
     	curl_setopt($ch, CURLOPT_URL, $url);
@@ -219,7 +219,7 @@
     	$curl_error = curl_error($ch);
     	curl_close($ch);
     
-    	if ($curl_errno > 0) {
+    	if ($curl_errno > 0 && $throw_error) {
     		throw new \Exception("$url\n request failed. cURLError #$curl_errno: $curl_error\n", $curl_errno, null);
     	}
     	
@@ -281,16 +281,19 @@
     /**
      * Builds DbPedia URI
      */
-    private static function getDbpediaUri($label, $params=[])
+    public static function getDbpediaUri($label, $params=[], $throw_error=true)
     {
     	// Get lang from url
     	$dbp_url = $GLOBALS["kernel"]->getContainer()->getParameter("wiki_tag.url_templates")["dbpedia_sparql"];
     	$lang = substr($dbp_url, 7, 2);
+    	// filter with regexp to avoid results with "category:LABEL" or other "abc:LABEL"
+    	//"query" => 'select distinct * where { ?s rdfs:label "'.$label.'"@'.$lang.' }',
+    	//"query" => 'select distinct * where { ?s rdfs:label "'.$label.'"@'.$lang.' . FILTER (regex(?s, "^http\\\\://[^:]+$")) }',
     	$params = [
-    		"query" => 'select distinct * where {?s rdfs:label "'.$label.'"@'.$lang.'}',
+    		"query" => 'select distinct * where { ?s rdfs:label "'.$label.'"@'.$lang.' . FILTER (regex(?s, "^http\\\\://[^:]+$")) }',
     		"format" => 'application/json',
     	];
-
+    	
     	$params_str = '';
     	foreach ($params as $key => $value) {
     		if ($params_str==''){
@@ -303,15 +306,30 @@
     	
     	$url = $GLOBALS["kernel"]->getContainer()->getParameter("wiki_tag.url_templates")["dbpedia_sparql"].'?'.$params_str;
     	
-    	$res = WikiTagUtils::curlRequest($url);
+    	$res = WikiTagUtils::curlRequest($url, $throw_error);
     	$val = json_decode($res, true);
     	$uri = "";
-    	if(array_key_exists("results", $val)){
-    		if(array_key_exists("bindings", $val["results"])){
-    			if(count($val["results"]["bindings"]) > 0){
-    				$uri = $val["results"]["bindings"][0]["s"]["value"];
-    			}
-    		}
+    	if($val){
+	    	if(array_key_exists("results", $val)){
+	    		if(array_key_exists("bindings", $val["results"])){
+	    			$len = count($val["results"]["bindings"]);
+	    			if($len > 0){
+	    				$uri = $val["results"]["bindings"][0]["s"]["value"];
+	    				if($len>1){
+	    					// If there are several results, we test the "url label" to see if it matches the label.
+	    					// Why? Because, for example, "1000" returns both "Category:1000" and "1000" as results.
+	    					// We keep this code to be safe but the regexp in the sparql request normally avoids this problem.
+	    					for($i=0;$i<$len;$i++){
+	    						$res_uri = $val["results"]["bindings"][$i]["s"]["value"];
+	    						$url_label = substr( $res_uri, strrpos( $res_uri, '/' )+1 );
+	    						if(str_replace(" ", "_", $label) == $url_label){
+	    							$uri = $res_uri;
+	    						}
+	    					}
+	    				}
+	    			}
+	    		}
+	    	}
     	}
     	return $uri;
     }