

DMOZのコンテンツ/構造XMLファイルを解析してMySQLに取り込もうとしていますが、そのための既存のスクリプトはどれも非常に古く、うまく動作しません。PHPで巨大な(1GBを超える)XMLファイルを開いて解析するにはどうすればよいですか?


大きなファイルの処理に本当に適しているPHPのAPIは2つだけです。1つ目は古いexpat API、2つ目は新しいXMLReader関数群です。これらのAPIは、SimpleXMLやDOMのようにツリー全体をメモリに読み込むのではなく、データを連続したストリームとして読み取ります。



/**
 * Minimal streaming (expat/SAX) parser for a DMOZ RDF dump.
 *
 * The file is fed to the parser in 4 KB slices, so memory use does not grow
 * with the size of the document. For every LINK element that appears while
 * the most recently seen TOPIC id starts with the configured prefix, the
 * link's R:RESOURCE attribute is echoed on its own line.
 *
 * Element and attribute names arrive upper-cased because the XML parser's
 * case folding option is left at its default (enabled).
 */
class SimpleDMOZParser
{
    /** @var array Names of the elements that enclose the current element. */
    protected $_stack = array();

    /** @var string Path of the XML file to parse. */
    protected $_file = "";

    /** @var mixed Parser handle returned by xml_parser_create(). */
    protected $_parser = null;

    /** @var string R:ID of the most recently opened TOPIC element. */
    protected $_currentId = "";

    /** @var string Name of the element currently being processed. */
    protected $_current = "";

    /** @var string Topic id prefix a link's topic must start with to be printed. */
    protected $_prefix = "Top/Home/Consumer_Information/Electronics/";

    /**
     * @param string      $file   Path of the XML file to parse.
     * @param string|null $prefix Optional topic id prefix to filter on; when
     *                            omitted the built-in Electronics prefix is
     *                            used. An empty string matches every topic.
     */
    public function __construct($file, $prefix = null)
    {
        $this->_file = $file;
        if ($prefix !== null) {
            $this->_prefix = (string) $prefix;
        }

        $this->_parser = xml_parser_create("UTF-8");
        // Callable arrays bind the handlers to this object directly, which
        // avoids xml_set_object() (deprecated as of PHP 8.4).
        xml_set_element_handler(
            $this->_parser,
            array($this, "startTag"),
            array($this, "endTag")
        );
    }

    /**
     * Start-element handler.
     *
     * @param mixed  $parser  Parser handle (unused).
     * @param string $name    Upper-cased element name.
     * @param array  $attribs Upper-cased attribute name => value.
     * @return void
     */
    public function startTag($parser, $name, $attribs)
    {
        array_push($this->_stack, $this->_current);

        if ($name == "TOPIC") {
            // A topic without an id resets the filter instead of silently
            // inheriting the id of the previous topic.
            $this->_currentId = isset($attribs["R:ID"]) ? $attribs["R:ID"] : "";
        }

        if ($name == "LINK"
            && isset($attribs["R:RESOURCE"])
            && ($this->_prefix === "" || strpos($this->_currentId, $this->_prefix) === 0)
        ) {
            echo $attribs["R:RESOURCE"] . "\n";
        }

        $this->_current = $name;
    }

    /**
     * End-element handler: restores the enclosing element as the current one.
     *
     * @param mixed  $parser Parser handle (unused).
     * @param string $name   Upper-cased element name (unused).
     * @return void
     */
    public function endTag($parser, $name)
    {
        $this->_current = array_pop($this->_stack);
    }

    /**
     * Streams the file through the parser.
     *
     * @return void
     * @throws RuntimeException If the file cannot be opened or read, or if
     *                          the XML is not well-formed.
     */
    public function parse()
    {
        $fh = fopen($this->_file, "r");
        if (!$fh) {
            throw new RuntimeException("Unable to open " . $this->_file . " for reading");
        }

        // Collect the failure first so the handle is always closed before
        // the exception leaves this method.
        $error = null;
        while (!feof($fh)) {
            $data = fread($fh, 4096);
            if ($data === false) {
                $error = "Unable to read from " . $this->_file;
                break;
            }

            // The third argument marks the final slice so the parser can
            // verify that the document is complete.
            if (!xml_parse($this->_parser, $data, feof($fh))) {
                $error = sprintf(
                    "XML error: %s at line %d",
                    xml_error_string(xml_get_error_code($this->_parser)),
                    xml_get_current_line_number($this->_parser)
                );
                break;
            }
        }
        fclose($fh);

        if ($error !== null) {
            throw new RuntimeException($error);
        }
    }
}
$parser = new SimpleDMOZParser("content.rdf.u8");
// The constructor only registers the element handlers; parse() is what
// actually opens the file and streams it through the parser.
$parser->parse();
Emil H

これは「PHPで大きなXMLを処理する最良の方法」と非常によく似た質問ですが、こちらにはDMOZカタログの解析という特定の問題に対処した、非常に優れた具体的な回答があります。ただし、このページは大きなXML全般についてGoogle検索で上位に表示されるため、もう一方の質問に投稿した私の回答もここに再掲します。



ファイルをストリーミングしながら、XMLルート要素のすべての子要素を抽出する単純なクラスです。pubmed.comの108MBのXMLファイルでテスト済みです。

/**
 * Example streamer that turns every streamed node into a SimpleXML object.
 *
 * NOTE(review): the base class XmlStreamer is not part of this snippet; it is
 * presumably the one that calls processNode() once per child of the root
 * element and interprets its return value - confirm against that class.
 */
class SimpleXmlStreamer extends XmlStreamer
{
    /**
     * Handles one streamed node.
     *
     * @param string $xmlString   XML source of the node.
     * @param string $elementName Name of the node's element.
     * @param int    $nodeIndex   Index of the node.
     * @return bool Always true.
     */
    public function processNode($xmlString, $elementName, $nodeIndex)
    {
        $xml = simplexml_load_string($xmlString);

        // simplexml_load_string() returns false for malformed input; such a
        // node is skipped instead of being treated as an object.
        if ($xml !== false) {
            // Do something with your SimpleXML object
        }

        return true;
    }
}

// NOTE(review): constructing the streamer presumably does not start the
// streaming by itself; the base class most likely needs an explicit call
// (e.g. a parse() method) - confirm against XmlStreamer.
$streamer = new SimpleXmlStreamer("myLargeXmlFile.xml");



<?xml version="1.0" encoding="UTF-8"?>
<Objects>
  <Object>
    <Title>Title 1</Title>
    <Name>It's name goes here</Name>
  </Object>
</Objects>




// Read complex-test.xml one <Object> element at a time.
$file = new Chunk('complex-test.xml', array('element' => 'Object'));

// read() yields a falsy value once no further element is available.
while ($xml = $file->read()) {
  $obj = simplexml_load_string($xml);
  if ($obj === false) {
    // the extracted fragment is not well-formed XML; skip it
    continue;
  }
  // do some parsing, insert to DB whatever
}

Class File

 * Chunk
 * Reads a large file in as chunks for easier parsing.
 * The chunks returned are whole <$this->options['element']/>s found within file.
 * Each call to read() returns the whole element including start and end tags.
 * Tested with a 1.8MB file, extracted 500 elements in 0.11s
 * (with no work done, just extracting the elements)
 * Usage:
 * <code>
 *   // initialize the object
 *   $file = new Chunk('chunk-test.xml', array('element' => 'Chunk'));
 *   // loop through the file until all lines are read
 *   while ($xml = $file->read()) {
 *     // do whatever you want with the string
 *     $o = simplexml_load_string($xml);
 *   }
 * </code>
 * @package default
 * @author Dom Hastings
/**
 * Chunk
 *
 * Reads a large XML file piece by piece and hands back one complete element
 * (start tag through end tag) per call to read(), so the document never has
 * to be held in memory as a whole.
 *
 * Limitations: elements of the configured name must not be nested inside
 * each other, the end tag must be written exactly as </name>, and a "/>"
 * sequence inside an attribute value of the start tag is not supported.
 */
class Chunk {
  /**
   * options
   *
   * @var array Contains all major options
   * @access public
   */
  public $options = array(
    'path' => './',       // string The path to check for $file in
    'element' => '',      // string The XML element to return
    'chunkSize' => 512    // integer The amount of bytes to retrieve in each chunk
  );

  /**
   * file
   *
   * @var string The filename being read
   * @access public
   */
  public $file = '';

  /**
   * pointer
   *
   * @var integer Byte offset in the file at which the next element search starts
   * @access public
   */
  public $pointer = 0;

  /**
   * handle
   *
   * @var resource The fopen() resource
   * @access private
   */
  private $handle = null;

  /**
   * reading
   *
   * @var boolean Whether the script is currently reading the file
   * @access private
   */
  private $reading = false;

  /**
   * readBuffer
   *
   * @var string Window of file data currently being scanned
   * @access private
   */
  private $readBuffer = '';

  /**
   * __construct
   *
   * Builds the Chunk object
   *
   * @param string $file The filename to work with (only its basename is used;
   *                     the directory comes from the 'path' option)
   * @param array $options The options with which to parse the file
   * @throws Exception If the file does not exist or cannot be opened
   * @access public
   */
  public function __construct($file, $options = array()) {
    // merge the options together
    $this->options = array_merge($this->options, (is_array($options) ? $options : array()));

    // check that the path ends with a /
    if (substr($this->options['path'], -1) != '/') {
      $this->options['path'] .= '/';
    }

    // normalize the filename
    $file = basename($file);

    // make sure chunkSize is an int, falling back to the default when too small
    $this->options['chunkSize'] = intval($this->options['chunkSize']);
    if ($this->options['chunkSize'] < 64) {
      $this->options['chunkSize'] = 512;
    }

    // check the file exists before resolving it: realpath() returns false
    // for a missing file, which would leave the error message without a path
    $path = $this->options['path'].$file;
    if (!is_file($path)) {
      throw new Exception('Cannot load file: '.$path);
    }

    // set the filename
    $this->file = realpath($path);

    // open the file
    $this->handle = fopen($this->file, 'r');

    // check the file opened successfully
    if (!$this->handle) {
      throw new Exception('Error opening file for reading');
    }
  }

  /**
   * __destruct
   *
   * Cleans up
   *
   * @return void
   * @access public
   */
  public function __destruct() {
    // close the file resource
    if (is_resource($this->handle)) {
      fclose($this->handle);
    }
  }

  /**
   * read
   *
   * Reads the next available occurrence of the XML element
   * $this->options['element']. When no element name is configured, the
   * rest of the file is returned in one piece instead.
   *
   * @return string|false The XML string, or false when nothing (more) could be read
   * @access public
   */
  public function read() {
    $element = empty($this->options['element']) ? '' : trim($this->options['element']);

    if ($element === '') {
      return $this->readAll();
    }

    return $this->readElement($element);
  }

  /**
   * Returns everything from the handle's current position to the end of the file.
   *
   * @return string|false The data, or false when nothing is left
   */
  private function readAll() {
    $this->reading = true;
    $buffer = stream_get_contents($this->handle);
    $this->reading = false;

    return ($buffer === false || $buffer === '') ? false : $buffer;
  }

  /**
   * Scans forward from $this->pointer for the next complete $element.
   *
   * @param string $element The element name to look for
   * @return string|false The element including its tags, or false when the
   *                      file holds no further complete element
   */
  private function readElement($element) {
    // "<name" must be followed by whitespace, "/" or ">" so that e.g.
    // <Objects> is not mistaken for <Object>, while start tags carrying
    // attributes are still recognised
    $openPattern = '/<'.preg_quote($element, '/').'(?=[\s\/>])/';
    $close = '</'.$element.'>';

    // longest piece of a start tag that can sit at the end of the window
    // without matching yet ("<name" still waiting for its following byte)
    $tailLength = strlen($element) + 1;

    // seek to the position we need in the file
    fseek($this->handle, $this->pointer);

    // file offset of the first byte held in the window
    $offset = $this->pointer;
    $this->readBuffer = '';
    $found = false;
    $this->reading = true;

    while ($this->reading) {
      if (!$found) {
        if (preg_match($openPattern, $this->readBuffer, $match, PREG_OFFSET_CAPTURE)) {
          // drop everything in front of the start tag
          $found = true;
          $offset += $match[0][1];
          $this->readBuffer = substr($this->readBuffer, $match[0][1]);
        } elseif (strlen($this->readBuffer) > $tailLength) {
          // nothing here: keep only the tail, which may hold the first
          // bytes of a start tag that continues in the next chunk
          $offset += strlen($this->readBuffer) - $tailLength;
          $this->readBuffer = substr($this->readBuffer, -$tailLength);
        }
      }

      if ($found) {
        $length = $this->elementLength($this->readBuffer, $close);

        if ($length !== false) {
          $xml = substr($this->readBuffer, 0, $length);

          // continue right behind this element on the next call
          $this->pointer = $offset + $length;
          $this->readBuffer = '';
          $this->reading = false;

          return $xml;
        }
      }

      // pull in the next chunk; an empty read means the file is exhausted
      $tmp = feof($this->handle) ? '' : fread($this->handle, $this->options['chunkSize']);

      if ($tmp === false || $tmp === '') {
        $this->reading = false;
      } else {
        $this->readBuffer .= $tmp;
      }
    }

    // no complete element left: remember where the unfinished data begins
    // so that repeated calls do not rescan the whole file
    $this->pointer = $offset;
    $this->readBuffer = '';

    return false;
  }

  /**
   * Determines how many bytes of $buffer belong to the element starting at
   * its first byte.
   *
   * @param string $buffer Data beginning with the element's start tag
   * @param string $close The end tag to look for
   * @return integer|false The element length, or false if it is not complete yet
   */
  private function elementLength($buffer, $close) {
    // end of the start tag
    $tagEnd = strpos($buffer, '>');

    if ($tagEnd === false) {
      return false;
    }

    // self-closing element (<name/>): it has no separate end tag
    if ($buffer[$tagEnd - 1] === '/') {
      return $tagEnd + 1;
    }

    $closeAt = strpos($buffer, $close, $tagEnd + 1);

    return ($closeAt === false) ? false : $closeAt + strlen($close);
  }
}
Mihir Rawal






率直に言って、私がこの方法を使っていたのは主にPHP 5.1.2より前のバージョンです。5.1.2以降ではXMLReaderが使えます。おそらくこれが最良の選択肢ですが、それ以前は、上記のチャンク分割戦略か、古いSAX/expatライブラリのどちらかを使うしかありませんでした。他の皆さんがどうかは分かりませんが、私はSAX/expatパーサーを書いたり保守したりするのが大嫌いです。

ただし、ドキュメントが多数の同一の最下位要素で構成されていない場合、このアプローチはあまり実用的ではないことに注意してください(たとえば、ファイルやURLなどのリストには最適ですが、大きなHTMLドキュメントの解析には向きません)。

Frank Farmer


PHPでのSAXの使用に関する情報: http://www.brainbell.com/tutorials/php/Parsing_XML_With_SAX.htm

Tetsujin no Oni

この用途では、XMLReaderとDOMを組み合わせることができます。PHPでは、どちらのAPI(およびSimpleXML)も同じライブラリであるlibxml2に基づいています。大きなXMLは通常、レコードのリストです。そこで、XMLReaderでレコードを順にたどり、1件のレコードだけをDOMに読み込んで、DOMのメソッドとXPathで値を抽出します。鍵となるのはXMLReader::expand()メソッドで、XMLReaderインスタンスの現在のノードとその子孫をDOMノードとして読み込みます。


<books>
  <book>
    <title isbn="978-0596100087">XSLT 1.0 Pocket Reference</title>
  </book>
  <book>
    <title isbn="978-0596100506">XML Pocket Reference</title>
  </book>
  <!-- ... -->
</books>


// open the XML file
$xmlFile = 'books.xml';
$reader = new XMLReader();
if (!$reader->open($xmlFile)) {
  throw new RuntimeException('Unable to open ' . $xmlFile);
}

// prepare a DOM document
$document = new DOMDocument();
$xpath = new DOMXpath($document);

// find the first `book` element node at any depth
while ($reader->read() && $reader->localName !== 'book') {
  continue;
}

// as long as here is a node with the name "book"
while ($reader->localName === 'book') {
  // expand the node into the prepared DOM
  $book = $reader->expand($document);
  if ($book !== false) {
    // use Xpath expressions to fetch values
    var_dump(
      $xpath->evaluate('string(title/@isbn)', $book),
      $xpath->evaluate('string(title)', $book)
    );
  }
  // move to the next book sibling node; without this the loop never advances
  $reader->next('book');
}

$reader->close();

展開されたノードはDOMドキュメントに追加されることはない点に注意してください。そのため、GCによってクリーンアップできます。


$namespaceURI = 'urn:example-books';

$xmlFile = 'books.xml';
$reader = new XMLReader();
if (!$reader->open($xmlFile)) {
  throw new RuntimeException('Unable to open ' . $xmlFile);
}

$document = new DOMDocument();
$xpath = new DOMXpath($document);
// register a prefix for the Xpath expressions
$xpath->registerNamespace('b', $namespaceURI);

// compare local node name and namespace URI
// The inner parentheses are required: && binds tighter than ||, so without
// them the loop would keep spinning at the end of the input whenever the
// namespace does not match.
while (
  $reader->read() &&
  (
    $reader->localName !== 'book' ||
    $reader->namespaceURI !== $namespaceURI
  )
) {
  continue;
}

// iterate the book elements
while ($reader->localName === 'book') {
  // validate that they are in the namespace
  if ($reader->namespaceURI === $namespaceURI) {
    $book = $reader->expand($document);
    if ($book !== false) {
      var_dump(
        $xpath->evaluate('string(b:title/@isbn)', $book),
        $xpath->evaluate('string(b:title)', $book)
      );
    }
  }
  // move to the next book sibling node; without this the loop never advances
  $reader->next('book');
}

$reader->close();




$xmlFile = 'the_LARGE_xml_file_to_load.xml';
$primEL  = 'the_name_of_your_element';

$xml     = new XMLReader();
if (!$xml->open($xmlFile)) {
    throw new RuntimeException('Unable to open ' . $xmlFile);
}

// finding first primary element to work with
while ($xml->read() && $xml->name !== $primEL) {
    continue;
}

// looping through elements
while ($xml->name === $primEL) {
    // loading element data into simpleXML object
    $element = new SimpleXMLElement($xml->readOuterXML());

    // DO STUFF

    // moving pointer to the next element of the same name; without this the
    // loop would process the first element forever
    $xml->next($primEL);
    // clearing current element so its memory can be reclaimed
    unset($element);
} // end while

$xml->close();
