--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/DoctrineMigrations/Version20140129151724.php Thu Jan 30 17:52:14 2014 +0100
@@ -0,0 +1,49 @@
+<?php
+
+namespace Application\Migrations;
+
+use Doctrine\DBAL\Migrations\AbstractMigration,
+ Doctrine\DBAL\Schema\Schema,
+ IRI\Bundle\WikiTagBundle\Utils\WikiTagUtils;
+
+/**
+ * Migration for WikiTagBundle <= V00.13
+ * This migration takes every tag label and searches for the REAL DBpedia URI associated with this label.
+ * Previously, the DBpedia URI was built manually as http://dbpedia.org/resource/ + english_label.
+ * Now we get the dbpedia uri by requesting http://LANG_CODE.dbpedia.org/sparql with the current label.
+ *
+ */
+class Version20140129151724 extends AbstractMigration
+{
+    /** Looks up the DBpedia URI of every tag from its label and stores it on the tag (MySQL only). */
+    public function up(Schema $schema)
+    {
+        $this->abortIf($this->connection->getDatabasePlatform()->getName() !== "mysql", "Migration can only be executed safely on 'mysql'.");
+
+        // Load every tag through the ORM, ordered by label.
+        $em = $GLOBALS["kernel"]->getContainer()->get('doctrine.orm.entity_manager');
+        $tags = $em->createQuery('SELECT t FROM WikiTagBundle:Tag t ORDER BY t.label ASC')->getResult();
+
+        $count = 0;
+        foreach ($tags as $tag) {
+            $count++; // 1-based position of the tag, so the progress output starts at 1
+            $label = $tag->getLabel();
+            // $throw_error = false: a failed request yields "" instead of aborting the whole run.
+            $uri = WikiTagUtils::getDbpediaUri($label, [], false);
+            $tag->setDbpediaUri($uri); // NOTE(review): "" (no match / failed request) overwrites the old URI - confirm intended
+            $em->persist($tag);
+            if ($count % 50 === 0) { // write to the database in batches of 50 tags
+                $em->flush();
+                echo "\n FLUSH";
+            }
+            echo "\n$count : $label \t\t: $uri";
+        }
+        $em->flush(); // write the tags of the last, partial batch
+    }
+
+    /** Only checks the platform: up() does not keep the previous URIs, so nothing is restored here. */
+    public function down(Schema $schema)
+    {
+        $this->abortIf($this->connection->getDatabasePlatform()->getName() !== "mysql", "Migration can only be executed safely on 'mysql'.");
+    }
+}
--- a/Utils/WikiTagUtils.php Wed Jan 29 12:16:16 2014 +0100
+++ b/Utils/WikiTagUtils.php Thu Jan 30 17:52:14 2014 +0100
@@ -190,7 +190,7 @@
* @param string $url
* @return object (json decoded)
*/
- private static function curlRequest($url)
+ private static function curlRequest($url, $throw_error=true)
{
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
@@ -219,7 +219,7 @@
$curl_error = curl_error($ch);
curl_close($ch);
- if ($curl_errno > 0) {
+ if ($curl_errno > 0 && $throw_error) {
throw new \Exception("$url\n request failed. cURLError #$curl_errno: $curl_error\n", $curl_errno, null);
}
@@ -281,16 +281,19 @@
/**
* Builds DbPedia URI
*/
- private static function getDbpediaUri($label, $params=[])
+ public static function getDbpediaUri($label, $params=[], $throw_error=true)
{
// Get lang from url
$dbp_url = $GLOBALS["kernel"]->getContainer()->getParameter("wiki_tag.url_templates")["dbpedia_sparql"];
$lang = substr($dbp_url, 7, 2);
+ // filter with regexp to avoid results with "category:LABEL" or other "abc:LABEL"
+ //"query" => 'select distinct * where { ?s rdfs:label "'.$label.'"@'.$lang.' }',
+ //"query" => 'select distinct * where { ?s rdfs:label "'.$label.'"@'.$lang.' . FILTER (regex(?s, "^http\\\\://[^:]+$")) }',
$params = [
- "query" => 'select distinct * where {?s rdfs:label "'.$label.'"@'.$lang.'}',
+ "query" => 'select distinct * where { ?s rdfs:label "'.$label.'"@'.$lang.' . FILTER (regex(?s, "^http\\\\://[^:]+$")) }',
"format" => 'application/json',
];
-
+
$params_str = '';
foreach ($params as $key => $value) {
if ($params_str==''){
@@ -303,15 +306,30 @@
$url = $GLOBALS["kernel"]->getContainer()->getParameter("wiki_tag.url_templates")["dbpedia_sparql"].'?'.$params_str;
- $res = WikiTagUtils::curlRequest($url);
+ $res = WikiTagUtils::curlRequest($url, $throw_error);
$val = json_decode($res, true);
$uri = "";
- if(array_key_exists("results", $val)){
- if(array_key_exists("bindings", $val["results"])){
- if(count($val["results"]["bindings"]) > 0){
- $uri = $val["results"]["bindings"][0]["s"]["value"];
- }
- }
+ if($val){
+ if(array_key_exists("results", $val)){
+ if(array_key_exists("bindings", $val["results"])){
+ $len = count($val["results"]["bindings"]);
+ if($len > 0){
+ $uri = $val["results"]["bindings"][0]["s"]["value"];
+ if($len>1){
+ // If there are several results, we test the "url label" to see if it matches the label.
+ // Why ? Because, for example "1000" gets "Category:1000" and "1000" as result.
+ // We keep this code to be safe but the regexp in the sparql request normally avoids this problem.
+ for($i=0;$i<$len;$i++){
+ $res_uri = $val["results"]["bindings"][$i]["s"]["value"];
+ $url_label = substr( $res_uri, strrpos( $res_uri, '/' )+1 );
+ if(str_replace(" ", "_", $label) == $url_label){
+ $uri = $res_uri;
+ }
+ }
+ }
+ }
+ }
+ }
}
return $uri;
}