mirror of
https://github.com/YunoHost-Apps/mediawiki_ynh.git
synced 2024-09-03 19:46:05 +02:00
206 lines
5.4 KiB
PHP
206 lines
5.4 KiB
PHP
<?php
/**
 * Helper class for the --prefetch option of dumpTextPass.php
 *
 * Copyright © 2005 Brion Vibber <brion@pobox.com>
 * http://www.mediawiki.org/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 * @ingroup Maintenance
 */

/**
 * Readahead helper for making large MediaWiki data dumps;
 * reads in a previous XML dump to sequentially prefetch text
 * records already normalized and decompressed.
 *
 * This can save load on the external database servers, hopefully.
 *
 * Assumes that dumps will be recorded in the canonical order:
 * - ascending by page_id
 * - ascending by rev_id within each page
 * - text contents are immutable and should not change once
 *   recorded, so the previous dump is a reliable source
 *
 * The reader only ever moves forward, so callers must request
 * page/revision pairs in that same ascending order.
 *
 * @ingroup Maintenance
 */
class BaseDump {
	/** @var XMLReader|null Streaming reader over the dump file currently being consumed */
	public $reader = null;

	/** @var bool True once the reader has been closed and no dump file is open */
	public $atEnd = false;

	/** @var bool True once the current <page> has no further <revision> elements */
	public $atPageEnd = false;

	/** @var int ID of the page most recently read from the dump */
	public $lastPage = 0;

	/** @var int ID of the revision most recently read within the current page */
	public $lastRev = 0;

	/** @var array|null Dump file names that have not been opened yet */
	public $infiles = null;

	/**
	 * Opens the first dump file and queues the remaining ones.
	 *
	 * If the first file cannot be opened the object starts out in the
	 * "at end" state, so prefetch() simply returns null.
	 *
	 * @param $infile String: one or more dump file names separated by ';'
	 */
	public function __construct( $infile ) {
		$this->infiles = explode( ';', $infile );
		$this->reader = new XMLReader();
		$infile = array_shift( $this->infiles );
		$this->atEnd = !$this->openDumpFile( $infile );
	}

	/**
	 * Points the reader at a dump file, using LIBXML_PARSEHUGE when
	 * the constant is available.
	 *
	 * @param $infile String: name of the dump file to open
	 * @return bool true if the reader is ready to be read from
	 */
	private function openDumpFile( $infile ) {
		// An empty name can never be opened; bail out before XMLReader
		// gets a chance to complain about it.
		if ( !is_string( $infile ) || $infile === '' ) {
			$this->debug( __METHOD__ . ' was given an empty file name' );
			return false;
		}

		if ( defined( 'LIBXML_PARSEHUGE' ) ) {
			$opened = $this->reader->open( $infile, null, LIBXML_PARSEHUGE );
		} else {
			$opened = $this->reader->open( $infile );
		}

		if ( !$opened ) {
			$this->debug( __METHOD__ . " was unable to open $infile" );
			return false;
		}

		return true;
	}

	/**
	 * Attempts to fetch the text of a particular page revision
	 * from the dump stream. May return null if the page is
	 * unavailable.
	 *
	 * @param $page Integer: ID number of page to read
	 * @param $rev Integer: ID number of revision to read
	 * @return string or null
	 */
	public function prefetch( $page, $rev ) {
		$page = intval( $page );
		$rev = intval( $rev );

		// Advance page by page until we reach (or overshoot) the wanted page.
		while ( $this->lastPage < $page && !$this->atEnd ) {
			$this->debug( "BaseDump::prefetch at page $this->lastPage, looking for $page" );
			$this->nextPage();
		}
		if ( $this->lastPage > $page || $this->atEnd ) {
			$this->debug( "BaseDump::prefetch already past page $page looking for rev $rev [$this->lastPage, $this->lastRev]" );
			return null;
		}

		// Same again for revisions inside the current page.
		while ( $this->lastRev < $rev && !$this->atEnd && !$this->atPageEnd ) {
			$this->debug( "BaseDump::prefetch at page $this->lastPage, rev $this->lastRev, looking for $page, $rev" );
			$this->nextRev();
		}
		if ( $this->lastRev == $rev && !$this->atEnd ) {
			$this->debug( "BaseDump::prefetch hit on $page, $rev [$this->lastPage, $this->lastRev]" );
			return $this->nextText();
		}

		$this->debug( "BaseDump::prefetch already past rev $rev on page $page [$this->lastPage, $this->lastRev]" );
		return null;
	}

	/**
	 * Sends a message to the debug log via wfDebug().
	 *
	 * @param $str String: message to log, without trailing newline
	 */
	public function debug( $str ) {
		wfDebug( $str . "\n" );
	}

	/**
	 * Moves to the next <page> element and records its ID. When the
	 * current file has no more pages, closes it and switches to the
	 * next queued dump file, if any.
	 *
	 * @access private
	 */
	public function nextPage() {
		if ( $this->skipTo( 'page', 'mediawiki' ) ) {
			if ( $this->skipTo( 'id' ) ) {
				$this->lastPage = intval( $this->nodeContents() );
				$this->lastRev = 0;
				$this->atPageEnd = false;
			}
			return;
		}

		// Current file is exhausted; close() leaves atEnd set to true.
		$this->close();
		if ( count( $this->infiles ) ) {
			$infile = array_shift( $this->infiles );
			// Only leave the "at end" state if the next file really opened.
			$this->atEnd = !$this->openDumpFile( $infile );
		}
	}

	/**
	 * Moves to the next <revision> element of the current page and
	 * records its ID, or flags the end of the page if there is none.
	 *
	 * @access private
	 */
	public function nextRev() {
		if ( $this->skipTo( 'revision' ) ) {
			if ( $this->skipTo( 'id' ) ) {
				$this->lastRev = intval( $this->nodeContents() );
			}
		} else {
			$this->atPageEnd = true;
		}
	}

	/**
	 * Reads the contents of the next <text> element.
	 *
	 * @access private
	 * @return string
	 */
	public function nextText() {
		$this->skipTo( 'text' );
		return strval( $this->nodeContents() );
	}

	/**
	 * Reads forward until an opening element called $name is found.
	 *
	 * @access private
	 * @param $name string Name of the element to stop on
	 * @param $parent string Name of the element whose closing tag aborts the search
	 * @return bool|null true if the element was found; false if the
	 *  closing tag of $parent came first or the reader is already at
	 *  the end; null if the input ran out (the reader is then closed)
	 */
	public function skipTo( $name, $parent = 'page' ) {
		if ( $this->atEnd ) {
			return false;
		}
		while ( $this->reader->read() ) {
			$type = $this->reader->nodeType;
			$current = $this->reader->name;
			if ( $type == XMLReader::ELEMENT && $current == $name ) {
				return true;
			}
			if ( $type == XMLReader::END_ELEMENT && $current == $parent ) {
				$this->debug( "BaseDump::skipTo found </$parent> searching for <$name>" );
				return false;
			}
		}
		return $this->close();
	}

	/**
	 * Shouldn't something like this be built-in to XMLReader?
	 * Fetches text contents of the current element, assuming
	 * no sub-elements or such scary things.
	 *
	 * @return String, or null if the reader is at the end or the
	 *  input ran out before the element was closed
	 * @access private
	 */
	public function nodeContents() {
		if ( $this->atEnd ) {
			return null;
		}
		if ( $this->reader->isEmptyElement ) {
			return "";
		}
		$buffer = "";
		while ( $this->reader->read() ) {
			switch ( $this->reader->nodeType ) {
				case XMLReader::TEXT:
				// Plain XMLReader::WHITESPACE nodes are deliberately not collected.
				case XMLReader::SIGNIFICANT_WHITESPACE:
					$buffer .= $this->reader->value;
					break;
				case XMLReader::END_ELEMENT:
					return $buffer;
			}
		}
		return $this->close();
	}

	/**
	 * Closes the reader and marks the stream as finished.
	 *
	 * @access private
	 * @return null
	 */
	public function close() {
		$this->reader->close();
		$this->atEnd = true;
		return null;
	}
}
|