<?php
/**
 * Helper class for the --prefetch option of dumpTextPass.php
 *
 * Copyright © 2005 Brion Vibber <brion@pobox.com>
 * http://www.mediawiki.org/
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 * @file
 * @ingroup Maintenance
 */

/**
 * Readahead helper for making large MediaWiki data dumps;
 * reads in a previous XML dump to sequentially prefetch text
 * records already normalized and decompressed.
 *
 * This can save load on the external database servers, hopefully.
 *
 * Assumes that dumps will be recorded in the canonical order:
 * - ascending by page_id
 * - ascending by rev_id within each page
 * - text contents are immutable and should not change once
 *   recorded, so the previous dump is a reliable source
 *
 * @ingroup Maintenance
 */
class BaseDump {
	/** @var XMLReader|null Streaming reader over the dump file currently being scanned */
	public $reader = null;

	/** @var bool True once the input is exhausted, or when no input file could be opened */
	public $atEnd = false;

	/** @var bool True once no further revision was found in the current page */
	public $atPageEnd = false;

	/** @var int ID of the page most recently read from the stream */
	public $lastPage = 0;

	/** @var int ID of the revision most recently read; reset to 0 when a new page is entered */
	public $lastRev = 0;

	/** @var array|null Names of the input files that have not been opened yet */
	public $infiles = null;

	/**
	 * Opens the first readable file of a previous dump.
	 *
	 * Declared as __construct() rather than as a method named after the
	 * class: such PHP4-style constructors are not recognised as constructors
	 * by PHP 8, which would leave the reader uninitialised.
	 *
	 * @param $infile String: one file name, or several separated by ';',
	 *   which are read one after the other
	 */
	public function __construct( $infile ) {
		$this->infiles = explode( ';', $infile );
		$this->reader = new XMLReader();
		if ( !$this->openNextFile() ) {
			// Nothing to read from: every prefetch() call will simply miss.
			$this->atEnd = true;
		}
	}

	/**
	 * Attempts to fetch the text of a particular page revision
	 * from the dump stream. May return null if the page is
	 * unavailable.
	 *
	 * The stream is only ever read forwards, so a request for a page or
	 * revision that lies before the current position returns null.
	 *
	 * @param $page Integer: ID number of page to read
	 * @param $rev Integer: ID number of revision to read
	 * @return string or null
	 */
	public function prefetch( $page, $rev ) {
		$page = intval( $page );
		$rev = intval( $rev );
		// Advance page by page until the requested page ID is reached or passed.
		while ( $this->lastPage < $page && !$this->atEnd ) {
			$this->debug( "BaseDump::prefetch at page $this->lastPage, looking for $page" );
			$this->nextPage();
		}
		if ( $this->lastPage > $page || $this->atEnd ) {
			$this->debug( "BaseDump::prefetch already past page $page looking for rev $rev  [$this->lastPage, $this->lastRev]" );
			return null;
		}
		// Same idea within the page: advance revision by revision.
		while ( $this->lastRev < $rev && !$this->atEnd && !$this->atPageEnd ) {
			$this->debug( "BaseDump::prefetch at page $this->lastPage, rev $this->lastRev, looking for $page, $rev" );
			$this->nextRev();
		}
		if ( $this->lastRev == $rev && !$this->atEnd ) {
			$this->debug( "BaseDump::prefetch hit on $page, $rev [$this->lastPage, $this->lastRev]" );
			return $this->nextText();
		} else {
			$this->debug( "BaseDump::prefetch already past rev $rev on page $page  [$this->lastPage, $this->lastRev]" );
			return null;
		}
	}

	/**
	 * Sends a message, followed by a newline, to wfDebug().
	 *
	 * @param $str String: message to log
	 */
	public function debug( $str ) {
		wfDebug( $str . "\n" );
	}

	/**
	 * Opens the next usable file from the queue of pending input files.
	 *
	 * Every file is opened the same way: with LIBXML_PARSEHUGE when that
	 * constant is defined. Empty names (for instance from a trailing ';')
	 * and files that cannot be opened are skipped.
	 *
	 * @return bool True if a file was opened, false if the queue ran out
	 */
	protected function openNextFile() {
		while ( count( $this->infiles ) ) {
			$infile = array_shift( $this->infiles );
			if ( $infile === '' ) {
				continue;
			}
			if ( defined( 'LIBXML_PARSEHUGE' ) ) {
				$opened = $this->reader->open( $infile, null, LIBXML_PARSEHUGE );
			} else {
				$opened = $this->reader->open( $infile );
			}
			if ( $opened ) {
				return true;
			}
			$this->debug( "BaseDump::openNextFile could not open $infile" );
		}
		return false;
	}

	/**
	 * Moves to the next <page> element and records its ID. When the current
	 * file holds no further page, switches to the next input file, if any;
	 * the caller is expected to call this method again to read its first page.
	 *
	 * @access private
	 */
	public function nextPage() {
		if ( $this->skipTo( 'page', 'mediawiki' ) ) {
			if ( $this->skipTo( 'id' ) ) {
				$this->lastPage = intval( $this->nodeContents() );
				$this->lastRev = 0;
				$this->atPageEnd = false;
			}
		} else {
			$this->close();
			// close() marked the stream as finished; undo that only if
			// another file really could be opened.
			if ( $this->openNextFile() ) {
				$this->atEnd = false;
			}
		}
	}

	/**
	 * Moves to the next <revision> element of the current page and records
	 * its ID, or flags the end of the page if there is none.
	 *
	 * @access private
	 */
	public function nextRev() {
		if ( $this->skipTo( 'revision' ) ) {
			if ( $this->skipTo( 'id' ) ) {
				$this->lastRev = intval( $this->nodeContents() );
			}
		} else {
			$this->atPageEnd = true;
		}
	}

	/**
	 * Moves to the next <text> element and returns its contents.
	 *
	 * @access private
	 * @return string
	 */
	public function nextText() {
		$this->skipTo( 'text' );
		return strval( $this->nodeContents() );
	}

	/**
	 * Reads forward until an opening tag called $name is found.
	 *
	 * @access private
	 * @param $name string Element to stop at
	 * @param $parent string Element whose closing tag aborts the search
	 * @return bool|null True if the element was found, false if the stream
	 *   had already ended or </$parent> was met first, null if the input
	 *   ran out during the search (the reader is closed in that case)
	 */
	public function skipTo( $name, $parent = 'page' ) {
		if ( $this->atEnd ) {
			return false;
		}
		while ( $this->reader->read() ) {
			if ( $this->reader->nodeType == XMLReader::ELEMENT &&
				$this->reader->name == $name ) {
				return true;
			}
			if ( $this->reader->nodeType == XMLReader::END_ELEMENT &&
				$this->reader->name == $parent ) {
				$this->debug( "BaseDump::skipTo found </$parent> searching for <$name>" );
				return false;
			}
		}
		return $this->close();
	}

	/**
	 * Shouldn't something like this be built-in to XMLReader?
	 * Fetches text contents of the current element, assuming
	 * no sub-elements or such scary things.
	 *
	 * @return string|null Null if the stream has ended, or ends before the
	 *   closing tag is reached
	 * @access private
	 */
	public function nodeContents() {
		if ( $this->atEnd ) {
			return null;
		}
		if ( $this->reader->isEmptyElement ) {
			return "";
		}
		$buffer = "";
		while ( $this->reader->read() ) {
			switch ( $this->reader->nodeType ) {
			// Nodes of type XMLReader::WHITESPACE are deliberately not collected.
			case XMLReader::TEXT:
			case XMLReader::SIGNIFICANT_WHITESPACE:
				$buffer .= $this->reader->value;
				break;
			case XMLReader::END_ELEMENT:
				return $buffer;
			}
		}
		return $this->close();
	}

	/**
	 * Closes the reader and marks the stream as finished.
	 *
	 * @access private
	 * @return null Always null, so that callers can write "return $this->close()"
	 */
	public function close() {
		$this->reader->close();
		$this->atEnd = true;
		return null;
	}
}