MediaWiki API: a bot creating hyperlinks?


I would like to create a bot that lists the pages of my wiki and then searches every page, creating a hyperlink wherever it finds the name of one page inside another page.

For example, I have a page called "Wiki", and another page contains the word "wiki". I want to turn that word into a hyperlink pointing to the page "Wiki".

It's my first time using this API, so I'm not sure how to proceed. I've already found that you can list all your pages with "list=allpages" and search for a string across the wiki with "list=search", but once I have the names of the pages containing that string, how can I edit just those occurrences in the page?

For now I'm doing it in PHP. Can I do something like getting the full content of a page, changing it, and then editing the page with the new text?
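
Here is roughly the flow I have in mind (completely untested; I'm guessing that "action=parse" is the right way to get the wikitext and "action=edit" the right way to save it, and I'm ignoring login and cookies for now; the URL and page title are placeholders):

$api = "https://example.com/api.php"; // my wiki's api.php (placeholder)

// 1. get the wikitext of a page
$get = json_decode(file_get_contents(
    $api . "?action=parse&format=json&prop=wikitext&page=" . urlencode("Some page")
), true);
$wikitext = $get["parse"]["wikitext"]["*"];

// 2. change it (here just a naive replacement)
$wikitext = str_replace(" wiki ", " [[Wiki|wiki]] ", $wikitext);

// 3. get a CSRF token, then save the page back with a POST
$tok = json_decode(file_get_contents(
    $api . "?action=query&format=json&meta=tokens"
), true);
$csrf = $tok["query"]["tokens"]["csrftoken"];

$postdata = http_build_query(array(
    "action" => "edit",
    "format" => "json",
    "title"  => "Some page",
    "text"   => $wikitext,
    "token"  => $csrf,
));
$context = stream_context_create(array("http" => array(
    "method"  => "POST",
    "header"  => "Content-Type: application/x-www-form-urlencoded",
    "content" => $postdata,
)));
echo file_get_contents($api, false, $context);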


1 answer

Answered by Ise

If you're interested, here is what I did to create this bot:

$path_cookie = "______path________"; // file used to store the session cookies
$botLogin = "Bot";
$botPass = "password";
$linkWiki = "https://example.com/api.php"; // URL of your wiki's api.php endpoint

if (!file_exists($path_cookie)) touch($path_cookie); //create a file to stay logged in

$curl = curl_init();

function requeteCurl($postfields, $curl, $linkWiki, $path_cookie) // helper used for every API request once you are logged in
{
    curl_setopt($curl, CURLOPT_URL, $linkWiki);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($curl, CURLOPT_POST, true);
    curl_setopt($curl, CURLOPT_POSTFIELDS, $postfields);
    curl_setopt($curl, CURLOPT_COOKIEFILE, realpath($path_cookie));
    $resultat = curl_exec($curl);

    return $resultat;
}

    /* First you need to log in with your bot */

$postfields = array(
        'action' => 'login',
        'format'=> 'json',
        'lgname' => $botLogin,
        'lgpassword' => $botPass
);

curl_setopt($curl, CURLOPT_URL, $linkWiki);
curl_setopt($curl, CURLOPT_COOKIESESSION, true);
curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
curl_setopt($curl, CURLOPT_POST, true);
curl_setopt($curl, CURLOPT_POSTFIELDS, $postfields);
curl_setopt($curl, CURLOPT_COOKIEJAR, $path_cookie); // store the cookies on this first request so the bot stays logged in
$connexion=curl_exec($curl); 
if (!$connexion) {
    throw new Exception("Error getting data from server ($linkWiki): " . curl_error($curl));
}
$json_connexion = json_decode($connexion, true);
$tokenConnexion = $json_connexion['login']['token']; // the first call only returns a login token; you have to log in a second time with it:
$postfields = array(
        'action' => 'login',
        'format'=> 'json',
        'lgtoken' => $tokenConnexion,
        'lgname' => $botLogin,
        'lgpassword' => $botPass

);

$connexionToken=requeteCurl($postfields, $curl, $linkWiki, $path_cookie);
var_dump($connexionToken);
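
// (optional sanity check – as far as I can tell the login response contains
// ['login']['result'], which should be "Success" after this second request)
$json_connexionToken = json_decode($connexionToken, true);
if ($json_connexionToken['login']['result'] !== 'Success') {
    die("Login failed: " . $connexionToken);
}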

    /* You have to list all the pages in your wiki to know which strings to search for */

$postfields = array(
        'action' => 'query',
        'format' => 'json',
        'list' => 'allpages',
        'aplimit' => 'max'
);
$pagesWiki = requeteCurl($postfields, $curl, $linkWiki, $path_cookie);
$json_pagesWikis = json_decode($pagesWiki, true);
$tabNomsPagesWiki = array();
$i = 0;
foreach ($json_pagesWikis["query"]["allpages"] as $pages) { // store all the page titles in an array
    $tabNomsPagesWiki[$i] = $pages["title"]; // array containing the page titles, unmodified
    $i++;
}
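
// note: even with aplimit=max the API returns pages in batches (500 per request,
// or 5000 with the apihighlimits/bot right, if I remember correctly); on a bigger
// wiki you would have to follow the "apcontinue" value returned in the response
// to fetch the remaining pages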

    /* Then you search the whole wiki to find the pages that contain each string */

foreach ($tabNomsPagesWiki as $chaineRecherchee) // use each page title as a search string
{
    $postfields = array(
            'action' => 'query',
            'format'=> 'json',
            'list' => 'search',
            'srsearch' => $chaineRecherchee,
            'srwhat' => 'text',
            'srlimit' => 'max'
    );

    $pagesString = requeteCurl($postfields, $curl, $linkWiki, $path_cookie);
    $json_pagesString = json_decode($pagesString, true);
    $pagesComportantLaRecherche = array(); // store the matching page titles in another array
    $i=0;
    foreach ($json_pagesString["query"]["search"] as $search ) {
        $pagesComportantLaRecherche[$i] = $search["title"] ;
        $i++;
    }

    /* now you have to find your string in the page */

    foreach ($pagesComportantLaRecherche as $pageRecherche) {
        if ($pageRecherche != $chaineRecherchee) { // you don't want to create a link to the page whose title you are searching for!
            $postfields = array(
                    'action' => 'parse',
                    'format'=> 'json',
                    'page' => $pageRecherche,
                    'prop' => 'wikitext'
            );
            $pageContent=requeteCurl($postfields, $curl, $linkWiki, $path_cookie);
            $json_pagesContent = json_decode($pageContent, true);

            $text_pagesContent = $json_pagesContent["parse"]["wikitext"]["*"] ; //now you have all the content of your page in a var

    /* To replace your string with a link, you first have to find the links already in the page, so you don't put a link inside a link */

            $stringLien = "[[".$chaineRecherchee."]]"; //that's the string which will replace the one in the text
            $stringLength = strlen($chaineRecherchee); 

            $patternLien = "/((\\[\\[[^\\]]*)[\\s](".$chaineRecherchee.")[\\s\\,\\.][^\\]]*\\]\\])|((\\[[^\\]]*)[\\s\\'](".$chaineRecherchee.")[\\s\\,\\.\\'][^\\]]*\\])/mi"; // a regex to find all the links in the page that already contain your string
            preg_match_all($patternLien, $text_pagesContent, $liens,PREG_OFFSET_CAPTURE);
            $patternNomPage = "/[\\s\\']".$chaineRecherchee."[\\s\\,\\.\\']/im"; // and a regex to find every bare occurrence of your string
            preg_match_all($patternNomPage, $text_pagesContent, $nomPages,PREG_OFFSET_CAPTURE);         
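
            // e.g. with $chaineRecherchee = "Wiki", the first pattern matches an
            // occurrence that is already inside a link, such as "[[Page about Wiki stuff]]"
            // or "[http://example.org the Wiki site]", while the second pattern matches
            // every bare " Wiki " in the text; comparing the offsets of the two below
            // lets you skip the occurrences that are already linked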

            $decalage = 1; // offset correction: each match starts one character before the title itself

            foreach ($nomPages[0] as $page) {
                // you need the offsets of all the occurrences and of all the links, to compare them
                $offsetNomPagetrouvee = $page[1];
                $est_dans_lien = false; 
                foreach ($liens[0] as $lien){
                    $lienOffset= $lien[1];      
                    $lienTaille = strlen($lien[0]); 
                    if($lienOffset <= $offsetNomPagetrouvee && $offsetNomPagetrouvee <= $lienOffset+ $lienTaille){
                        $est_dans_lien = true;
                        break;
                    }
                }
                if (!$est_dans_lien) { // if the occurrence is not already inside a link, replace it with a link
                    $text_pagesContent = substr_replace($text_pagesContent, $stringLien, $offsetNomPagetrouvee+$decalage, $stringLength);
                    $decalage += 4; // shift the following offsets, since each replacement adds four characters: [[ and ]]
                }

            }


            if($decalage>1){ //if you created some new links, then you edit the page

                $postfields = array(
                        'action' => 'query',
                        'meta' => 'tokens',
                        'format' => 'json'
                );
                $tokenEdit=requeteCurl($postfields, $curl, $linkWiki, $path_cookie);
                $json_tokenEdit = json_decode($tokenEdit, true);
                $text_tokenEdit = $json_tokenEdit['query']['tokens']['csrftoken'];


                $postfields = array(
                        'action' => 'edit',
                        'format' => 'json',
                        'title' => $pageRecherche,
                        'text' => $text_pagesContent,
                        'bot' => '',
                        'token' => $text_tokenEdit
                );
                $edit=requeteCurl($postfields, $curl, $linkWiki, $path_cookie);
                echo "\n".$edit;

            }



        }

    }

}


unlink($path_cookie); // delete the cookie file when the bot is done

Well, I'm sure there is some unnecessary stuff in this code, but I'm not a pro in PHP or MediaWiki, and the script runs just fine like that, so it's not that bad I think ^^