Utils/WikiTagUtils.php
changeset 44 c114504de4a8
parent 43 54f204bceb28
child 50 e967654e90cb
equal deleted inserted replaced
41:1c4e3fdba170 44:c114504de4a8
    11     private static $WIKIPEDIA_VERSION_PERMALINK_TEMPLATE = "http://fr.wikipedia.org/w/index.php?oldid=%s";
    11     private static $WIKIPEDIA_VERSION_PERMALINK_TEMPLATE = "http://fr.wikipedia.org/w/index.php?oldid=%s";
    12     private static $DBPEDIA_URI_TEMPLATE = "http://dbpedia.org/resource/%s";
    12     private static $DBPEDIA_URI_TEMPLATE = "http://dbpedia.org/resource/%s";
    13     
    13     
    14     
    14     
    15     /**
    15     /**
    16      * Get or create tag. Returns an array(tag:WikiTagTag, revision_id=int, created:Boolean)
       
    17      */
       
    18     
       
    19     /**
       
    20      *
       
    21      * Enter description here ...
       
    22      * @param unknown_type $tag_label
       
    23      * @param unknown_type $doctrine
       
    24      * @return multitype:boolean Ambiguous <NULL, \IRI\Bundle\WikiTagBundle\Entity\Tag> Ambiguous <NULL, unknown, mixed, string> (array(\IRI\Bundle\WikiTagBundle\Model\TagInterface, revision_id=int, created:Boolean))
       
    25      */
       
    26     public static function getOrCreateTag($tag_label, $doctrine)
       
    27     {
       
    28         $tag_label_normalized = WikiTagUtils::normalizeTag($tag_label);
       
    29         // We get the wikipedia references for the tag_label
       
    30         // We get or create the tag object
       
    31         $tags = $doctrine->getRepository('WikiTagBundle:Tag')->findBy(array('normalizedLabel' => $tag_label_normalized));
       
    32         $tag = null;
       
    33         foreach ($tags as $t){
       
    34             if($tag==null || $t->getUrlStatus()!=Tag::$TAG_URL_STATUS_DICT['null_result']){
       
    35                 $tag = $t;
       
    36                 if($t->getUrlStatus()!=Tag::$TAG_URL_STATUS_DICT['null_result']){
       
    37                     break;
       
    38                 }
       
    39             }
       
    40         }
       
    41         $wp_request_done = false;
       
    42         if($tag==null){
       
    43             $tag = new Tag();
       
    44             $tag->setLabel($tag_label_normalized);
       
    45             $tag->setOriginalLabel($tag_label);
       
    46             $tag->setNormalizedLabel($tag_label_normalized);
       
    47             $created = true;
       
    48         }
       
    49         else{
       
    50             $created = false;
       
    51             $match_exists = false;
       
    52             // Even if a tag with the normalised label exists, IF this tag is not wikipedia semantised,
       
    53             // we search if a wikipedia semantised version exists in the base
       
    54             foreach ($tags as $t){
       
    55                 if($t->getUrlStatus()==Tag::$TAG_URL_STATUS_DICT['match']){
       
    56                     $tag = $t;
       
    57                     $match_exists = true;
       
    58                     break;
       
    59                 }
       
    60             }
       
    61             if($match_exists==false){
       
    62                 $wp_response = WikiTagUtils::getWikipediaInfo($tag_label_normalized);
       
    63                 $status = $wp_response['status'];
       
    64                 if($status==Tag::$TAG_URL_STATUS_DICT['match']){
       
    65                     $tag = new Tag();
       
    66                     $tag->setLabel($tag_label_normalized);
       
    67                     $tag->setOriginalLabel($tag_label);
       
    68                     $tag->setNormalizedLabel($tag_label_normalized);
       
    69                     $created = true;
       
    70                     $wp_request_done = true;
       
    71                 }
       
    72             }
       
    73         }
       
    74         
       
    75         // We request Wikipedia if the tag is created
       
    76         if($created==true){
       
    77             if($wp_request_done==false){
       
    78                 $wp_response = WikiTagUtils::getWikipediaInfo($tag_label_normalized);
       
    79             }
       
    80             $new_label = $wp_response['new_label'];
       
    81             $status = $wp_response['status'];
       
    82             $url = $wp_response['wikipedia_url'];
       
    83             $pageid = $wp_response['pageid'];
       
    84             $dbpedia_uri = $wp_response["dbpedia_uri"];
       
    85             $wikipedia_revision_id = $wp_response['revision_id'];
       
    86             
       
    87             # We save the data
       
    88             if($new_label!=null){
       
    89                 $tag->setLabel($new_label);
       
    90             }
       
    91             if($status!=null){
       
    92                 $tag->setUrlStatus($status);
       
    93             }
       
    94             $tag->setWikipediaUrl($url);
       
    95             $tag->setWikipediaPageId($pageid);
       
    96             $tag->setDbpediaUri($dbpedia_uri);
       
    97             
       
    98             // Save the data.
       
    99             $em = $doctrine->getEntityManager();
       
   100             $em->persist($tag);
       
   101             $em->flush();
       
   102             
       
   103         }
       
   104         else if($tag!=null && $tag->getWikipediaPageId()!=null){
       
   105             $wp_response = WikiTagUtils::getWikipediaInfo(null, $tag->getWikipediaPageId());
       
   106             $wikipedia_revision_id = $wp_response['revision_id'];
       
   107         }
       
   108         else{
       
   109             $wikipedia_revision_id = null;
       
   110         }
       
   111         
       
   112         return array($tag, $wikipedia_revision_id, $created);//, $wpReponse);
       
   113     }
       
   114     
       
   115     /**
       
   116      * Cleans the tag label
    16      * Cleans the tag label
   117      */
    17      */
   118     public static function normalizeTag($tag_label)
    18     public static function normalizeTag($tag_label)
   119     {
    19     {
   120         if(strlen($tag_label)==0){
    20         if(strlen($tag_label)==0){
   121             return $tag_label;
    21             return $tag_label;
   122         }
    22         }
   123         $tag_label = trim($tag_label);//tag.strip()
    23         $tag_label = trim($tag_label);//tag.strip()
   124         $tag_label = str_replace("_", " ", $tag_label);//tag.replace("_", " ")
    24         $tag_label = str_replace("_", " ", $tag_label);//tag.replace("_", " ")
   125         $tag_label = str_replace("Œ", "oe", $tag_label);
    25         $tag_label = preg_replace('/\s+/u', ' ', $tag_label);//" ".join(tag.split())
   126         $tag_label = str_replace("œ", "oe", $tag_label);
       
   127         $tag_label = preg_replace('/\s+/', ' ', $tag_label);//" ".join(tag.split())
       
   128         $tag_label = ucfirst($tag_label);//tag[0].upper() + tag[1:]
    26         $tag_label = ucfirst($tag_label);//tag[0].upper() + tag[1:]
   129         return $tag_label;
    27         return $tag_label;
   130     }
    28     }
   131     
    29     
   132     /**
    30     /**
       
    31      * Query wikipedia with a normalized label or a pageid
       
    32      * return an array with the form
       
    33      * array(
       
    34      *      'new_label'=>$new_label,
       
    35      *   	'alternative_label'=>$alternative_label,
       
    36      *   	'status'=>$status,
       
    37      *   	'wikipedia_url'=>$url,
       
    38      *      'wikipedia_alternative_url'=>$alternative_url,
       
    39      *   	'pageid'=>$pageid,
       
    40      *   	'alternative_pageid'=>$alternative_pageid,
       
    41      *   	'dbpedia_uri'=>$dbpedia_uri,
       
    42      *   	'revision_id'=>$revision_id,
       
    43      *   	'response'=> the original wikipedia json response
       
    44      *   	)
   133      *
    45      *
   134      * TODO: Enter description here ...
    46      * @param string $tag_label_normalized
   135      * @param unknown_type $tag_label_normalized
    47      * @param bigint $page_id
   136      * @param unknown_type $page_id
    48      * @return array
   137      * @return multitype:NULL unknown |multitype:Ambiguous <NULL, unknown> multitype:number  mixed Ambiguous <NULL, string> Ambiguous <unknown, mixed>
       
   138      */
    49      */
   139     public static function getWikipediaInfo($tag_label_normalized, $page_id=null)
    50     public static function getWikipediaInfo($tag_label_normalized, $page_id=null)
   140     {
    51     {
   141         $params = array('action'=>'query', 'prop'=>'info|categories|langlinks', 'inprop'=>'url', 'lllimit'=>'500', 'cllimit'=>'500', 'rvprop'=>'ids', 'format'=>'json');
    52         $params = array('action'=>'query', 'prop'=>'info|categories|langlinks', 'inprop'=>'url', 'lllimit'=>'500', 'cllimit'=>'500', 'rvprop'=>'ids', 'format'=>'json');
   142         if($tag_label_normalized!=null){
    53         if($tag_label_normalized!=null){
   179         else{
    90         else{
   180             //return " MATCH";
    91             //return " MATCH";
   181             $status = Tag::$TAG_URL_STATUS_DICT["match"];
    92             $status = Tag::$TAG_URL_STATUS_DICT["match"];
   182         }
    93         }
   183         // For a redirection, we have to fetch more data by adding redirects=true to the params
    94         // For a redirection, we have to fetch more data by adding redirects=true to the params
   184         if($status==Tag::$TAG_URL_STATUS_DICT["redirection"]){
    95         $alternative_label = null;
       
    96         $alternative_url = null;
       
    97         $alternative_pageid = null;
       
    98         if($status==Tag::$TAG_URL_STATUS_DICT["redirection"])
       
    99         {
   185             $params['redirects'] = "true";
   100             $params['redirects'] = "true";
   186             $ar = WikiTagUtils::requestWikipedia($params);
   101             $ar = WikiTagUtils::requestWikipedia($params);
   187             $res = $ar[0];
   102             $res = $ar[0];
   188             $pages = $ar[1];
   103             $pages = $ar[1];
   189             #we know that we have at least one answer
   104             #we know that we have at least one answer
   190             if(count($pages)>1 || count($pages)==0){
   105             if(count($pages)>1 || count($pages)==0){
   191                 return WikiTagUtils::returnNullResult($res);
   106                 return WikiTagUtils::returnNullResult($res);
   192             }
   107             }
   193             // get first result
   108             // get first result
   194             $page = reset($pages);
   109             $page = reset($pages);
       
   110             $alternative_label = array_key_exists('title', $page) ? $page['title'] : null;
       
   111             $alternative_url = array_key_exists('fullurl', $page) ? $page['fullurl'] : null;
       
   112             $alternative_pageid = array_key_exists('pageid', $page) ? $page['pageid'] : null;
   195         }
   113         }
   196         
   114         
   197         $revision_id = $page['lastrevid'];
   115         $revision_id = $page['lastrevid'];
   198         
   116         
   199         // process language to extract the english label
   117         // process language to extract the english label
   212         $dbpedia_uri = null;
   130         $dbpedia_uri = null;
   213         if($english_label!=null && strpos($english_label, '#')===false){
   131         if($english_label!=null && strpos($english_label, '#')===false){
   214             $dbpedia_uri = WikiTagUtils::getDbpediaUri($english_label);
   132             $dbpedia_uri = WikiTagUtils::getDbpediaUri($english_label);
   215         }
   133         }
   216         
   134         
   217         $wp_response = array('new_label'=>$new_label, 'status'=>$status, 'wikipedia_url'=>$url, 'pageid'=>$pageid, 'dbpedia_uri'=>$dbpedia_uri, 'revision_id'=>$revision_id, 'response'=>$original_response);
   135         $wp_response = array(
       
   136             'new_label'=>$new_label,
       
   137         	'alternative_label'=>$alternative_label,
       
   138         	'status'=>$status,
       
   139         	'wikipedia_url'=>$url,
       
   140             'wikipedia_alternative_url'=>$alternative_url,
       
   141         	'pageid'=>$pageid,
       
   142         	'alternative_pageid'=>$alternative_pageid,
       
   143         	'dbpedia_uri'=>$dbpedia_uri,
       
   144         	'revision_id'=>$revision_id,
       
   145         	'response'=>$original_response);
   218         //return $url." <br/>RES =  ".$res/*." <br/>DUMP =  ".var_dump($pages)*/." <br/>COUNT =  ".count($pages)." <br/>page =  ".var_dump($page);
   146         //return $url." <br/>RES =  ".$res/*." <br/>DUMP =  ".var_dump($pages)*/." <br/>COUNT =  ".count($pages)." <br/>page =  ".var_dump($page);
   219         return $wp_response;
   147         return $wp_response;
   220     }
   148     }
   221     
   149     
   222 
   150 
   223     /**
   151     /**
       
   152      * build and do the request to Wikipedia.
   224      *
   153      *
   225      * TODO : Enter description here ...
   154      * @param array $params
   226      * @param unknown_type $params
   155      * @return array
   227      * @return multitype:unknown mixed
       
   228      */
   156      */
   229     private static function requestWikipedia($params)
   157     private static function requestWikipedia($params)
   230     {
   158     {
   231         $params_str = '';
   159         $params_str = '';
   232         foreach ($params as $key => $value) {
   160         foreach ($params as $key => $value) {