Utils/WikiTagUtils.php
author ymh <ymh.work@gmail.com>
Fri, 25 Nov 2011 18:55:42 +0100
changeset 42 0e57c730bb18
parent 32 38dcd2db04e4
child 43 54f204bceb28
permissions -rwxr-xr-x
Documentation and add alternative wp url and label + migrations

<?php

namespace IRI\Bundle\WikiTagBundle\Utils;

use IRI\Bundle\WikiTagBundle\Entity\Tag;

/**
 * Static helpers used to normalise tag labels and to look a tag up on the
 * French Wikipedia API in order to derive its Wikipedia URL, page id,
 * revision id, status and DBpedia URI.
 */
class WikiTagUtils
{
    // Constants
    private static $WIKIPEDIA_API_URL = "http://fr.wikipedia.org/w/api.php";
    private static $WIKIPEDIA_VERSION_PERMALINK_TEMPLATE = "http://fr.wikipedia.org/w/index.php?oldid=%s";
    private static $DBPEDIA_URI_TEMPLATE = "http://dbpedia.org/resource/%s";
    
    
    /**
     * Cleans the tag label.
     *
     * The label is trimmed, underscores become spaces, the "oe" ligatures are
     * expanded, runs of whitespace are collapsed to a single space and the
     * first character is upper-cased (multibyte aware when mbstring is
     * available).
     *
     * @param string|null $tag_label the raw label
     * @return string|null the cleaned label; null and "" are returned unchanged
     */
    public static function normalizeTag($tag_label)
    {
        if ($tag_label === null || strlen($tag_label) == 0) {
            return $tag_label;
        }
        $tag_label = trim($tag_label);
        $tag_label = str_replace("_", " ", $tag_label);
        $tag_label = str_replace(array("Œ", "œ"), "oe", $tag_label);
        $tag_label = preg_replace('/\s+/', ' ', $tag_label);
        return self::upperCaseFirst($tag_label);
    }
    
    /**
     * Upper-cases the first character of a string.
     *
     * ucfirst() works on bytes and therefore leaves a leading multibyte
     * letter such as "é" untouched; when mbstring is loaded and the text is
     * valid UTF-8 the first character is upper-cased as a whole character.
     *
     * @param string $text
     * @return string
     */
    private static function upperCaseFirst($text)
    {
        if ($text === '' || $text === null) {
            return $text;
        }
        if (!function_exists('mb_substr') || !mb_check_encoding($text, 'UTF-8')) {
            return ucfirst($text);
        }
        $first = mb_substr($text, 0, 1, 'UTF-8');
        // The tail is cut by byte length of the first character so it is copied untouched.
        return mb_strtoupper($first, 'UTF-8').(string)substr($text, strlen($first));
    }
    
    /**
     * Queries Wikipedia for a tag, by label or (when no label is given) by page id.
     *
     * The returned array always contains the keys: new_label, alternative_label,
     * status, wikipedia_url, wikipedia_alternative_url, pageid, alternative_pageid,
     * dbpedia_uri, revision_id and response (the raw body of the first request).
     * The status is one of the Tag::$TAG_URL_STATUS_DICT values "match",
     * "redirection", "homonyme" or "null_result". For a redirection the
     * "alternative_*" entries describe the page the redirect points to.
     *
     * @param string|null $tag_label_normalized the normalized label to look up
     * @param int|string|null $page_id the Wikipedia page id, used only when the label is empty
     * @return array the Wikipedia information, or a "null_result" array when
     *               the lookup fails or does not yield exactly one existing page
     */
    public static function getWikipediaInfo($tag_label_normalized, $page_id=null)
    {
        $params = array('action'=>'query', 'prop'=>'info|categories|langlinks', 'inprop'=>'url', 'lllimit'=>'500', 'cllimit'=>'500', 'rvprop'=>'ids', 'format'=>'json');
        // Values are passed raw: requestWikipedia() URL-encodes the whole query string.
        if ($tag_label_normalized != null) {
            $params['titles'] = $tag_label_normalized;
        }
        else if ($page_id != null) {
            $params['pageids'] = $page_id;
        }
        else {
            return self::returnNullResult(null);
        }
        
        $answer = self::requestWikipedia($params);
        $res = $answer[0];
        $original_response = $res;
        $pages = $answer[1];
        // If there is 0 or more than 1 result, the query has failed
        if (count($pages) != 1) {
            return self::returnNullResult($res);
        }
        // get first result
        $page = reset($pages);
        // Unknown entry ?
        if (!is_array($page) || array_key_exists('missing', $page) || array_key_exists('invalid', $page)) {
            return self::returnNullResult($res);
        }
        // The entry exists, we get the data.
        $url = self::getOrNull($page, 'fullurl');
        $pageid = self::getOrNull($page, 'pageid');
        $new_label = self::getOrNull($page, 'title');
        
        $statuses = Tag::$TAG_URL_STATUS_DICT;
        // We test the status (redirect first because a redirect has no categories key)
        if (array_key_exists('redirect', $page)) {
            $status = $statuses["redirection"];
        }
        else if (self::isHomonymy($page)) {
            $status = $statuses["homonyme"];
        }
        else {
            $status = $statuses["match"];
        }
        
        // In redirection, we have to get more data by adding redirects=true to the params
        $alternative_label = null;
        $alternative_url = null;
        $alternative_pageid = null;
        if ($status == $statuses["redirection"]) {
            $params['redirects'] = "true";
            $answer = self::requestWikipedia($params);
            $res = $answer[0];
            $pages = $answer[1];
            // Exactly one target page is expected
            if (count($pages) != 1) {
                return self::returnNullResult($res);
            }
            // get first result: from here on $page is the redirect target
            $page = reset($pages);
            if (!is_array($page)) {
                return self::returnNullResult($res);
            }
            $alternative_label = self::getOrNull($page, 'title');
            $alternative_url = self::getOrNull($page, 'fullurl');
            $alternative_pageid = self::getOrNull($page, 'pageid');
        }
        
        // A page entry without revision information yields a null revision id
        $revision_id = self::getOrNull($page, 'lastrevid');
        
        // process language links to extract the english label
        $english_label = null;
        if ($status == $statuses["match"] || $status == $statuses["redirection"]) {
            if (array_key_exists("langlinks", $page) && is_array($page["langlinks"])) {
                foreach ($page["langlinks"] as $langlink) {
                    if (isset($langlink["lang"]) && $langlink["lang"] == "en" && isset($langlink["*"])) {
                        $english_label = $langlink["*"];
                        break;
                    }
                }
            }
        }
        // We create the dbpedia uri (labels pointing to a page section, i.e. containing '#', are skipped).
        $dbpedia_uri = null;
        if ($english_label != null && strpos($english_label, '#') === false) {
            $dbpedia_uri = self::getDbpediaUri($english_label);
        }
        
        return array(
            'new_label'=>$new_label,
            'alternative_label'=>$alternative_label,
            'status'=>$status,
            'wikipedia_url'=>$url,
            'wikipedia_alternative_url'=>$alternative_url,
            'pageid'=>$pageid,
            'alternative_pageid'=>$alternative_pageid,
            'dbpedia_uri'=>$dbpedia_uri,
            'revision_id'=>$revision_id,
            'response'=>$original_response);
    }
    
    /**
     * Returns $array[$key] when the key exists, null otherwise.
     *
     * @param array $array
     * @param string $key
     * @return mixed|null
     */
    private static function getOrNull($array, $key)
    {
        return array_key_exists($key, $array) ? $array[$key] : null;
    }
    

    /**
     * Sends a GET request to the Wikipedia API and decodes the JSON answer.
     *
     * @param array $params the query parameters, as raw (not URL-encoded) values
     * @return array a two-element array: the raw response body (false when the
     *               HTTP request failed) and the array found under query.pages
     *               in the decoded answer (an empty array when it is absent or
     *               the body is not valid JSON)
     */
    private static function requestWikipedia($params)
    {
        // http_build_query encodes every key and value (including the '|' separators).
        $url = self::$WIKIPEDIA_API_URL.'?'.http_build_query($params, '', '&');
        
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_USERAGENT, 'http://www.iri.centrepompidou.fr');
        $res = curl_exec($ch);
        curl_close($ch);
        
        // Never hand a non-array back: callers count() the pages.
        $pages = array();
        if (is_string($res)) {
            $val = json_decode($res, true);
            if (is_array($val) && isset($val["query"]["pages"]) && is_array($val["query"]["pages"])) {
                $pages = $val["query"]["pages"];
            }
        }
        return array($res, $pages);
    }
    
    /**
     * Returns tag with a null result, usually used after a failed request on Wikipedia.
     *
     * The array carries the same keys as a successful getWikipediaInfo() result,
     * all null except the status ("null_result") and the raw response.
     *
     * @param mixed $response the raw response to report, if any
     * @return array
     */
    private static function returnNullResult($response)
    {
        return array(
            'new_label'=>null,
            'alternative_label'=>null,
            'status'=>Tag::$TAG_URL_STATUS_DICT['null_result'],
            'wikipedia_url'=>null,
            'wikipedia_alternative_url'=>null,
            'pageid'=>null,
            'alternative_pageid'=>null,
            'dbpedia_uri'=>null,
            'revision_id'=>null,
            'response'=>$response);
    }
    
    /**
     * Tells whether a page is a disambiguation (homonymy) page, i.e. whether
     * one of its category titles contains 'Catégorie:Homonymie' or
     * 'Category:Disambiguation'.
     *
     * @param array $page a page entry of the Wikipedia API answer
     * @return boolean false when the page has no categories
     */
    private static function isHomonymy($page)
    {
        if (!isset($page["categories"]) || !is_array($page["categories"])) {
            return false;
        }
        foreach ($page["categories"] as $category) {
            if (!isset($category["title"])) {
                continue;
            }
            // Strict test because position 0 would otherwise be seen as false.
            if (strpos($category["title"], 'Catégorie:Homonymie') !== false || strpos($category["title"], 'Category:Disambiguation') !== false) {
                return true;
            }
        }
        return false;
    }
    
    /**
     * Builds DbPedia URI
     *
     * @param string $english_label the label of the english Wikipedia page
     * @return string
     */
    private static function getDbpediaUri($english_label)
    {
        return sprintf(self::$DBPEDIA_URI_TEMPLATE, self::urlize_for_wikipedia($english_label));
    }
    
    /**
     * URLencode label for wikipedia: spaces become underscores, then the
     * result is URL-encoded.
     *
     * @param string $label
     * @return string
     */
    private static function urlize_for_wikipedia($label){
        return urlencode(str_replace(" ", "_", $label));
    }
}