Utils/WikiTagUtils.php
author ymh <ymh.work@gmail.com>
Fri, 26 Sep 2014 10:04:40 +0200
changeset 136 f209fcb0df6c
parent 129 65c12455ce74
permissions -rwxr-xr-x
Added tag V00.18 for changeset ab728cb3632c

<?php
/*
 * This file is part of the WikiTagBundle package.
 *
 * (c) IRI <http://www.iri.centrepompidou.fr/>
 *
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
 */

namespace IRI\Bundle\WikiTagBundle\Utils;

use IRI\Bundle\WikiTagBundle\Entity\Tag;

class WikiTagUtils
{
    /**
     * Cleans a tag label.
     *
     * The label is trimmed, underscores become spaces, every run of
     * whitespace is collapsed to a single space and the first letter is
     * upper-cased. This mirrors the Python reference implementation:
     * tag.strip(), tag.replace("_", " "), " ".join(tag.split()),
     * tag[0].upper() + tag[1:].
     *
     * @param string $tag_label raw label (UTF-8 expected)
     * @return string the normalized label; empty/null input is returned as is
     */
    public static function normalizeTag($tag_label)
    {
        if ($tag_label === null || strlen($tag_label) == 0) {
            return $tag_label;
        }

        $label = str_replace("_", " ", trim($tag_label));

        // In /u mode preg_replace returns null on invalid UTF-8: fall back to
        // a byte-wise replacement rather than silently emptying the label.
        $collapsed = preg_replace('/\s+/u', ' ', $label);
        $is_utf8 = ($collapsed !== null);
        if (!$is_utf8) {
            $collapsed = preg_replace('/\s+/', ' ', $label);
        }

        // " ".join(tag.split()) also drops leading/trailing blanks, which may
        // have been introduced by the underscore replacement ("_foo_").
        $label = trim($collapsed);
        if ($label === '') {
            return $label;
        }

        // ucfirst() only handles single-byte letters; use mbstring when it is
        // available so that labels such as "école" are capitalised as well.
        if ($is_utf8 && function_exists('mb_strtoupper') && function_exists('mb_substr')) {
            $first = mb_substr($label, 0, 1, 'UTF-8');
            return mb_strtoupper($first, 'UTF-8').substr($label, strlen($first));
        }

        return ucfirst($label);
    }

    /**
     * Query wikipedia with a normalized label or a pageid
     * return an array with the form
     * array(
     *      'new_label'=>$new_label,
     *      'alternative_label'=>$alternative_label,
     *      'status'=>$status,
     *      'wikipedia_url'=>$url,
     *      'wikipedia_alternative_url'=>$alternative_url,
     *      'pageid'=>$pageid,
     *      'alternative_pageid'=>$alternative_pageid,
     *      'dbpedia_uri'=>$dbpedia_uri,
     *      'revision_id'=> last revision id of the page (of the redirect target for a redirection),
     *      'response'=> the original wikipedia json response
     *      )
     *
     * When nothing usable is found (no label and no page id, zero or several
     * pages, missing/invalid page) the same keys are returned with null
     * values and the "null_result" status.
     *
     * @param string $tag_label_normalized label to look up (takes precedence over $page_id)
     * @param bigint $page_id wikipedia page id, used when no label is given
     * @param boolean $ignore_wikipedia_error when true, a failed wikipedia request is logged and a null result is returned instead of throwing
     * @param object $logger optional logger exposing an error($message) method
     * @return array
     * @throws \Exception when a wikipedia request fails and $ignore_wikipedia_error is false, or when the dbpedia request fails
     */
    public static function getWikipediaInfo($tag_label_normalized, $page_id=null, $ignore_wikipedia_error=false, $logger = null)
    {
        $params = array(
            'action'=>'query',
            'prop'=>'info|categories|langlinks',
            'inprop'=>'url',
            'lllimit'=>'500',
            'cllimit'=>'500',
            'rvprop'=>'ids',
            'format'=>'json'
        );
        // Values are URL-encoded once, when requestWikipedia() builds the query string.
        if ($tag_label_normalized != null) {
            $params['titles'] = $tag_label_normalized;
        }
        elseif ($page_id != null) {
            $params['pageids'] = $page_id;
        }
        else {
            return self::returnNullResult(null);
        }

        $answer = self::tryRequestWikipedia($params, $ignore_wikipedia_error, $logger, "Error when querying wikipedia : ");
        if ($answer === null) {
            return self::returnNullResult(null);
        }
        list($res, $pages) = $answer;
        $original_response = $res;

        // If there 0 or more than 1 result, the query has failed.
        $page = self::singlePageOrNull($pages);
        // Unknown entry ?
        if ($page === null || array_key_exists('missing', $page) || array_key_exists('invalid', $page)) {
            return self::returnNullResult($res);
        }

        // The entry exists, we get the data.
        $url = isset($page['fullurl']) ? $page['fullurl'] : null;
        $pageid = isset($page['pageid']) ? $page['pageid'] : null;
        $new_label = isset($page['title']) ? $page['title'] : null;

        // We test the status (redirect first because a redirect has no categories key)
        $is_redirect = array_key_exists('redirect', $page);
        if ($is_redirect) {
            $status = Tag::$TAG_URL_STATUS_DICT["redirection"];
        }
        elseif (self::isHomonymy($page)) {
            $status = Tag::$TAG_URL_STATUS_DICT["homonyme"];
        }
        else {
            $status = Tag::$TAG_URL_STATUS_DICT["match"];
        }

        // In redirection, we have to get more data by adding redirects=true to the params
        $alternative_label = null;
        $alternative_url = null;
        $alternative_pageid = null;
        if ($is_redirect) {
            $params['redirects'] = "true";
            $answer = self::tryRequestWikipedia($params, $ignore_wikipedia_error, $logger, "Error when querying wikipedia for redirection : ");
            if ($answer === null) {
                return self::returnNullResult(null);
            }
            list($res, $pages) = $answer;

            $page = self::singlePageOrNull($pages);
            if ($page === null) {
                return self::returnNullResult($res);
            }
            $alternative_label = isset($page['title']) ? $page['title'] : null;
            $alternative_url = isset($page['fullurl']) ? $page['fullurl'] : null;
            $alternative_pageid = isset($page['pageid']) ? $page['pageid'] : null;
        }

        // For a redirection $page is now the target page, so this is the target's revision.
        $revision_id = isset($page['lastrevid']) ? $page['lastrevid'] : null;

        // Get the dbpedia uri by requesting dbpedia with sparql
        $dbpedia_uri = self::getDbpediaUri($new_label);

        return array(
            'new_label'=>$new_label,
            'alternative_label'=>$alternative_label,
            'status'=>$status,
            'wikipedia_url'=>$url,
            'wikipedia_alternative_url'=>$alternative_url,
            'pageid'=>$pageid,
            'alternative_pageid'=>$alternative_pageid,
            'dbpedia_uri'=>$dbpedia_uri,
            'revision_id'=>$revision_id,
            'response'=>$original_response
        );
    }

    /**
     * Runs requestWikipedia() and applies the "ignore wikipedia error" policy.
     *
     * @param array $params query parameters
     * @param boolean $ignore_wikipedia_error when true, failures are logged and null is returned
     * @param object $logger optional logger exposing an error($message) method
     * @param string $log_prefix beginning of the logged message
     * @return array|null array(raw response, pages) or null when the error was ignored
     * @throws \Exception when the request fails and errors are not ignored
     */
    private static function tryRequestWikipedia($params, $ignore_wikipedia_error, $logger, $log_prefix)
    {
        try {
            return self::requestWikipedia($params);
        }
        catch(\Exception $e) {
            if (!$ignore_wikipedia_error) {
                throw $e;
            }
            if (!is_null($logger)) {
                $logger->error($log_prefix.$e->getMessage()." with trace : ".$e->getTraceAsString());
            }
            return null;
        }
    }

    /**
     * Returns the only page of a wikipedia "pages" list, or null when the
     * list does not contain exactly one well-formed page.
     *
     * @param mixed $pages the "query.pages" part of the decoded response
     * @return array|null
     */
    private static function singlePageOrNull($pages)
    {
        if (!is_array($pages) || count($pages) != 1) {
            return null;
        }
        $page = reset($pages);
        return is_array($page) ? $page : null;
    }

    /**
     * Generic curl request
     *
     * Default user agent and timeout are set first, then every entry of the
     * "wiki_tag.curl_options" container parameter is applied (typically for
     * proxy configuration).
     *
     * @param string $url
     * @param boolean $throw_error when false, curl errors are not raised and the raw curl_exec() result is returned
     * @return string|boolean the raw response body, or false when curl failed and $throw_error is false
     * @throws \Exception on curl error when $throw_error is true (code = curl error number)
     */
    private static function curlRequest($url, $throw_error=true)
    {
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        // default values
        curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:26.0) Gecko/20100101 Firefox/26.0');
        curl_setopt($ch, CURLOPT_TIMEOUT_MS, 5000);
        // Set options if they are set in the config.yml file, typically for proxy configuration.
        // Thanks to the configuration file, it will execute commands like "curl_setopt($ch, CURLOPT_PROXYTYPE, CURLPROXY_HTTP);" or "curl_setopt($ch, CURLOPT_PROXY, "xxx.yyy.zzz:PORT");"
        $curl_options = $GLOBALS["kernel"]->getContainer()->getParameter("wiki_tag.curl_options");
        if (is_array($curl_options)) {
            foreach ($curl_options as $key => $value) {
                // Only strings can name a boolean or a constant; other values
                // (booleans, integers, arrays of headers...) are passed through untouched.
                if (is_string($value)) {
                    $upper = strtoupper($value);
                    if ($upper == 'TRUE') {
                        $value = true;
                    }
                    elseif ($upper == 'FALSE') {
                        $value = false;
                    }
                    elseif (defined($value)) {
                        $value = constant($value);
                    }
                }
                curl_setopt($ch, constant($key), $value);
            }
        }
        // end of treatment
        $res = curl_exec($ch);
        $curl_errno = curl_errno($ch);
        $curl_error = curl_error($ch);
        curl_close($ch);

        if ($curl_errno > 0 && $throw_error) {
            throw new \Exception("$url\n request failed. cURLError #$curl_errno: $curl_error\n", $curl_errno, null);
        }

        return $res;
    }

    /**
     * build and do the request to Wikipedia.
     *
     * The URL is the "wikipedia_api" entry of the "wiki_tag.url_templates"
     * container parameter followed by the URL-encoded parameters.
     *
     * @param array $params raw (not URL-encoded) query parameters
     * @return array array(raw response, pages); pages is an empty array when the response has no "query.pages" entry
     * @throws \Exception when the curl request fails
     */
    private static function requestWikipedia($params)
    {
        $url_templates = $GLOBALS["kernel"]->getContainer()->getParameter("wiki_tag.url_templates");
        // The separator is forced to "&" because the ini setting arg_separator.output may differ.
        $url = $url_templates["wikipedia_api"].'?'.http_build_query($params, '', '&');

        $res = self::curlRequest($url);
        $val = json_decode($res, true);

        // An undecodable answer or an API error has no page list: report "no page".
        $pages = array();
        if (is_array($val) && isset($val["query"]["pages"]) && is_array($val["query"]["pages"])) {
            $pages = $val["query"]["pages"];
        }
        return array($res, $pages);
    }

    /**
     * Returns tag with a null result, usually used after a failed request on Wikipedia
     *
     * The array has the same keys as a successful getWikipediaInfo() result.
     *
     * @param string|null $response raw wikipedia response, if any
     * @return array
     */
    private static function returnNullResult($response)
    {
        return array(
            'new_label'=>null,
            'alternative_label'=>null,
            'status'=>Tag::$TAG_URL_STATUS_DICT['null_result'],
            'wikipedia_url'=>null,
            'wikipedia_alternative_url'=>null,
            'pageid'=>null,
            'alternative_pageid'=>null,
            'dbpedia_uri'=>null,
            'revision_id'=>null,
            'response'=>$response
        );
    }

    /**
     * Tells whether a wikipedia page is a disambiguation (homonymy) page,
     * i.e. whether one of its categories contains "Catégorie:Homonymie" or
     * "Category:Disambiguation".
     *
     * @param array $page one page of the decoded wikipedia response
     * @return boolean
     */
    private static function isHomonymy($page)
    {
        if (!isset($page["categories"]) || !is_array($page["categories"])) {
            return false;
        }
        foreach ($page["categories"] as $category) {
            if (!isset($category["title"])) {
                continue;
            }
            // Strict test because position 0 would otherwise be seen as false.
            if (strpos($category["title"], 'Catégorie:Homonymie')!==false || strpos($category["title"], 'Category:Disambiguation')!==false) {
                return true;
            }
        }
        return false;
    }

    /**
     * Extracts the label language from the dbpedia sparql endpoint URL.
     *
     * The language is the leading 2 or 3 letter sub-domain of the host
     * ("fr" for http://fr.dbpedia.org/sparql). "en" is used when there is no
     * such sub-domain.
     *
     * @param string $dbp_url
     * @return string
     */
    private static function getDbpediaLang($dbp_url)
    {
        $host = parse_url($dbp_url, PHP_URL_HOST);
        if (is_string($host) && preg_match('/^([a-z]{2,3})\./i', $host, $matches)) {
            $lang = strtolower($matches[1]);
            if ($lang != 'www') {
                return $lang;
            }
        }
        return 'en';
    }

    /**
     * Looks up the DbPedia URI of a resource with a sparql request.
     *
     * @param string $label value to search: a label, a wikipedia page id or a wikipedia url, depending on $req_param
     * @param array $params currently unused, kept for backward compatibility
     * @param boolean $throw_error when false, a failed request yields an empty result instead of an exception
     * @param string $req_param "label" (default), "pageid" or "wikiurl"
     * @return string the DbPedia URI, or an empty string when nothing was found
     * @throws \Exception when the curl request fails and $throw_error is true
     */
    public static function getDbpediaUri($label, $params=array(), $throw_error=true, $req_param="label")
    {
        $url_templates = $GLOBALS["kernel"]->getContainer()->getParameter("wiki_tag.url_templates");
        $dbp_url = $url_templates["dbpedia_sparql"];

        if ($req_param=="pageid") {
            $query = 'select distinct * where { ?s dbpedia-owl:wikiPageID '.$label.' }';
        }
        elseif ($req_param=="wikiurl") {
            $query = 'select distinct * where { ?s foaf:isPrimaryTopicOf <'.$label.'> }';
        }
        else {
            // Get lang from url
            $lang = self::getDbpediaLang($dbp_url);
            // Backslashes and double quotes would otherwise break out of the sparql string literal.
            $escaped_label = str_replace(array('\\', '"'), array('\\\\', '\\"'), $label);
            // filter with regexp to avoid results with "category:LABEL" or other "abc:LABEL"
            $query = 'select distinct * where { ?s rdfs:label "'.$escaped_label.'"@'.$lang.' . FILTER (regex(?s, "^http\\\\://[^:]+$")) }';
        }

        $query_params = array(
            "query" => $query,
            "format" => 'application/json',
        );
        $url = $dbp_url.'?'.http_build_query($query_params, '', '&');

        $res = self::curlRequest($url, $throw_error);
        $val = json_decode($res, true);

        if (!is_array($val) || !isset($val["results"]["bindings"]) || !is_array($val["results"]["bindings"])) {
            return "";
        }
        $bindings = $val["results"]["bindings"];
        if (count($bindings) == 0 || !isset($bindings[0]["s"]["value"])) {
            return "";
        }

        $uri = $bindings[0]["s"]["value"];
        if (count($bindings) > 1) {
            // If there are several results, we test the "url label" to see if it matches the label.
            // Why ? Because, for example "1000" gets "Category:1000" and "1000" as result.
            // We keep this code to be safe but the regexp in the sparql request normally avoids this problem.
            $expected_url_label = str_replace(" ", "_", $label);
            foreach ($bindings as $binding) {
                if (!isset($binding["s"]["value"])) {
                    continue;
                }
                $res_uri = $binding["s"]["value"];
                $url_label = substr($res_uri, strrpos($res_uri, '/')+1);
                if ($expected_url_label == $url_label) {
                    $uri = $res_uri;
                }
            }
        }
        return $uri;
    }

    /**
     * URLencode label for wikipedia (spaces become underscores first)
     *
     * @param string $label
     * @return string
     */
    private static function urlize_for_wikipedia($label){
        return urlencode(str_replace(" ", "_", $label));
    }
}