<?php
/**
* Web Application Component Toolkit
*
* @link http://www.phpwact.org/
*
* @author Wact Development Team
* @link http://www.phpwact.org/team
*
* @copyright Copyright 2006, Jeff Moore
* @license http://opensource.org/licenses/mit-license.php MIT
*/

/**
* HTML/XHTML parser that robustly scans a text string and outputs a
* stream of SAX events.
*
* This class is designed primarily to parse HTML fragments.  Intended usage
* scenarios include:
*   - A basis for sanitizing HTML strings
*   - Recognizing XML style markup embedded in text documents
*   - Implementing HTML-like markup languages
*
* This class emits events in a custom SAX API that is designed to represent
* HTML documents as well as XML documents.  However, this class should be geared
* primarily to HTML markup.
*
* As a general strategy, markup events are emitted only for valid markup,
* although the sequence of events may not constitute a well-formed document.
*
* There is no current definition of what is valid HTML markup and what is not.
* (The HTML 4.0.1 spec claims a relationship to the SGML specification which
* is not helpful for this application.)  The WHATWG working group
* (http://whatwg.org/) is working on such a specification and this parser
* should track the parsing recommendations of that group as much as possible.
*
* If potential markup is encountered that the parser does not understand, it
* is passed through in the form of character data.
*
* Entities are not parsed and instead passed through unaltered.
* Character data and attribute values may contain the <, &, and > characters.
* Minimized boolean attributes are allowed.
* Attribute values without quotation marks are allowed.
* Comments are parsed SGML style.
* 
* The text string to be parsed must be UTF-8.
*
* PHP CODE:
* This parser has no knowledge of embedded PHP code.  If you want to process
* a document with embedded PHP code, use the php tokenizer to recognize the
* php and replace it with unique entities.  The entities can be replaced later
* in the event stream.  These entities may still cause problems for this parser
* if they appear at places other than attribute values or character data.
*/
// Rename to reader
class Wact_Html_Sax_Parser implements Wact_Html_Sax_Locator {

    /**
    * An identifier for the document being parsed, as passed to parse().
    */
    public $publicId;

    /**
    * The object receiving the content events emitted by this parser.
    * @var Wact_Html_Sax_Handler event handler
    */
    public $handler = NULL;

    /**
    * Text of the document being parsed.
    * @var string
    */
    public $rawtext;

    /**
    * Current byte offset in the document, relative to its start (0).
    * @var int
    */
    public $position;

    /**
    * Byte offset of the first character that has not yet been reported
    * through a characters event.
    * @var int
    */
    public $charStart;

    /**
    * Byte offset at which the markup currently being examined begins.
    * @var int
    */
    public $markupStart;

    /**
    * Length of the document in bytes (as returned by strlen()).
    * @var int
    */
    public $length;


    /**
    * Sets the object that will receive the parser events.
    * @param Wact_Html_Sax_Handler $obj event handler
    * @return void
    */
    public function setHandler($obj) {
        $this->handler = $obj;
    }

    /**
    * Calculates the line number from the current byte offset.
    * @return int the current line number (1-based)
    */
    public function getLineNumber() {
        return 1 + substr_count(substr($this->rawtext, 0, $this->position), "\n");
    }

    /**
    * Returns the identifier given to parse() for the current document.
    * @return mixed the public id, or NULL when none was supplied
    */
    public function getPublicId() {
        return $this->publicId;
    }

    /**
    * Calculates the column number from the current byte offset.
    *
    * The column is measured in bytes from the start of the current line, so
    * a multi-byte UTF-8 character counts once for each of its bytes.
    * @return int the current column number (1-based)
    */
    public function getColumnNumber() {
        $offset = (int) $this->position;
        $consumed = substr((string) $this->rawtext, 0, $offset);
        $lastNewline = strrpos($consumed, "\n");
        if ($lastNewline === false) {
            // Still on the first line: the column is the offset itself.
            return $offset + 1;
        }
        return $offset - $lastNewline;
    }

    /**
    * Returns the current byte offset of the parser within the document.
    * @return int
    */
    public function getCharacterOffset() {
        return $this->position;
    }

    /**
    * Returns the raw text spanning from the start of the markup currently
    * being examined up to the current position.
    * @return string
    */
    public function getRawEventString() {
        return substr($this->rawtext, $this->markupStart, $this->position - $this->markupStart);
    }

    /**
    * Emits a characters event for the text lying between the end of the
    * previously recognized markup and the start of the current markup (if
    * there is any), then marks everything up to the current position as
    * reported.
    * @return void
    */
    public function emitCharacters() {
        if ($this->markupStart > $this->charStart) {
            $this->handler->characters(substr($this->rawtext, $this->charStart, $this->markupStart - $this->charStart));
        }
        $this->charStart = $this->position;
    }

    /**
    * Scans the document and reports it to the handler as a stream of events:
    * startDocument, then any mix of characters, startElement, emptyElement,
    * endElement, comment and cdata, then endDocument.
    *
    * Potential markup that does not match one of the recognized forms is not
    * reported as markup; it stays part of the surrounding character data.
    *
    * @param string $data document to parse (expected to be UTF-8)
    * @param mixed $publicId optional identifier for the document
    * @return void
    */
    public function parse($data, $publicId = NULL) {
        $this->rawtext = $data;
        $this->length = strlen($data);
        $this->position = 0;
        $this->charStart = 0;
        // Reset so the locator methods never see an offset left over from a
        // previous parse() call.
        $this->markupStart = 0;
        $this->publicId = $publicId;

        $this->handler->startDocument($this, 'UTF-8');

        $nameStartChar = ':_a-zA-Z\xC0-\xD6\xD8-\xF6\xF8-\x{2FF}\x{370}-\x{37D}\x{37F}-\x{1FFF}\x{200C}-\x{200D}\x{2070}-\x{218F}\x{2C00}-\x{2FEF}\x{3001}-\x{D7FF}\x{F900}-\x{FDCF}\x{FDF0}-\x{FFFD}';
        $nameChar = $nameStartChar . '-.0-9\x{b7}\x{0300}-\x{036f}\x{203f}-\x{2040}';

        $namePattern = '[' . $nameStartChar . '][' . $nameChar . ']*';

        // All patterns except the first are anchored with \G, so they only
        // match exactly at the offset handed to preg_match()/preg_match_all().
        $markupStartPattern = '/<\/|<!|<[' . $nameStartChar . ']/u';
        $endElementPattern = '/\G(' . $namePattern. ')\s*>/u';
        $startElementPattern = '/\G(' . $namePattern. ')(?=\s|>|\/>)/u';
        $startElementEndPattern = '/\G\s*(\/)?>/u';
        $attributePattern = '/\G\s*(' . $namePattern . ')(\s*=\s*("|\'|)(.*?)\3){0,1}(?=\s|\/>|>)/us';
        $commentPattern = '/\G--(.*?)--\s*/us';
        $commentDeclEndPattern ='/\G>/u';

        // Non-greedy: a CDATA section ends at the FIRST "]]>", not the last
        // one found in the document.
        $cdataPattern = '/\G\[CDATA\[(.*?)\]\]\>/us';

        do {
            // Find the next thing that could be markup: "</", "<!" or "<"
            // followed by a name start character.
            if (!preg_match($markupStartPattern, $this->rawtext, $matches, PREG_OFFSET_CAPTURE, $this->position)) {
                break;
            }

            $this->markupStart = $matches[0][1];
            $this->position = $this->markupStart + 2;

            switch($matches[0][0]) {
            case '</':
                // End tag: a name followed by optional whitespace and ">".
                if (!preg_match($endElementPattern, $this->rawtext, $matches, PREG_OFFSET_CAPTURE, $this->position)) {
                    break;
                }

                $tag = $matches[1][0];
                $this->position += strlen($matches[0][0]);

                $this->emitCharacters();
                $this->handler->endElement($tag);

                break;
            case '<!':
                // Comment declaration: one or more "--...--" comments
                // followed by ">".
                if (preg_match_all($commentPattern, $this->rawtext, $commentMatches, PREG_SET_ORDER, $this->position)) {
                    $comments = array();
                    foreach($commentMatches as $match) {
                        $this->position += strlen($match[0]);
                        $comments[] = $match[1];
                    }
                    if (!preg_match($commentDeclEndPattern, $this->rawtext, $matches, 0, $this->position)) {
                        break;
                    }
                    $this->position += strlen($matches[0]);
                    $this->emitCharacters();
                    // A lone comment is reported as a string, several
                    // comments in one declaration as an array of strings.
                    if (count($comments) == 1) {
                        $this->handler->comment($comments[0]);
                    } else {
                        $this->handler->comment($comments);
                    }
                    break;
                }
                // Not a comment; try a CDATA section ("<![CDATA[ ... ]]>").
                if (preg_match($cdataPattern, $this->rawtext, $matches, PREG_OFFSET_CAPTURE, $this->position)) {
                    $cdata = $matches[1][0];
                    $this->position += strlen($matches[0][0]);

                    $this->emitCharacters();

                    $this->handler->cdata($cdata);
                }

                break;

            default:
                // Start tag.  The markup start pattern consumed the first
                // character of the name, so step back onto it.
                $this->position--; // We overcaptured
                if (!preg_match($startElementPattern, $this->rawtext, $matches, PREG_OFFSET_CAPTURE, $this->position)) {
                    break;
                }
                $tag = $matches[1][0];
                $attributes = array();
                $this->position += strlen($matches[0][0]);

                if (preg_match_all($attributePattern, $this->rawtext, $attributeMatches, PREG_SET_ORDER, $this->position)) {
                    foreach($attributeMatches as $match) {
                        $this->position += strlen($match[0]);
                        $name = $match[1];
                        // Group 4 is absent for a minimized attribute (a
                        // name with no "=value" part); report it as NULL.
                        if (isset($match[4])) {
                            $value = $match[4];
                        } else {
                            $value = NULL;
                        }
                        $attributes[$name] = $value;
                    }
                }

                if (!preg_match($startElementEndPattern, $this->rawtext, $matches, 0, $this->position)) {
                    break;
                }

                $this->position += strlen($matches[0]);

                $this->emitCharacters();
                // Group 1 is only present when the tag closed with "/>".
                if (isset($matches[1])) {
                    $this->handler->emptyElement($tag, $attributes);
                } else {
                    $this->handler->startElement($tag, $attributes);
                }

                // see http://www.w3.org/TR/REC-html40/appendix/notes.html#notes-specifying-data
                // for special handling issues with script and style tags

                break;
            }

        } while ($this->position < $this->length);

        // emit any extra characters left on the end
        if ($this->charStart < $this->length) {
            $this->handler->characters(substr($this->rawtext, $this->charStart));
        }

        $this->handler->endDocument();

    }

}
?>
