changes to
* * @todo check XML catalogs compatibility * @author Tobiasz Cudnik * @package phpQuery */ class DOMDocumentWrapper { /** * @var DOMDocument */ public $document; public $id; /** * @todo Rewrite as method and quess if null. * @var unknown_type */ public $contentType = ''; public $xpath; public $uuid = 0; public $data = array(); public $dataNodes = array(); public $events = array(); public $eventsNodes = array(); public $eventsGlobal = array(); /** * @TODO iframes support http://code.google.com/p/phpquery/issues/detail?id=28 * @var unknown_type */ public $frames = array(); /** * Document root, by default equals to document itself. * Used by documentFragments. * * @var DOMNode */ public $root; public $isDocumentFragment; public $isXML = false; public $isXHTML = false; public $isHTML = false; public $charset; public function __construct($markup = null, $contentType = null, $newDocumentID = null) { if (isset($markup)) $this->load($markup, $contentType, $newDocumentID); $this->id = $newDocumentID ? $newDocumentID : md5(microtime()); } public function load($markup, $contentType = null, $newDocumentID = null) { // phpQuery::$documents[$id] = $this; $this->contentType = strtolower($contentType); if ($markup instanceof DOMDOCUMENT) { $this->document = $markup; $this->root = $this->document; $this->charset = $this->document->encoding; // TODO isDocumentFragment } else { $loaded = $this->loadMarkup($markup); } if ($loaded) { // $this->document->formatOutput = true; $this->document->preserveWhiteSpace = true; $this->xpath = new DOMXPath($this->document); $this->afterMarkupLoad(); return true; // remember last loaded document // return phpQuery::selectDocument($id); } return false; } protected function afterMarkupLoad() { if ($this->isXHTML) { $this->xpath->registerNamespace("html", "http://www.w3.org/1999/xhtml"); } } protected function loadMarkup($markup) { $loaded = false; if ($this->contentType) { self::debug("Load markup for content type {$this->contentType}"); // content determined by contentType list($contentType, $charset) = $this->contentTypeToArray($this->contentType); switch($contentType) { case 'text/html': phpQuery::debug("Loading HTML, content type '{$this->contentType}'"); $loaded = $this->loadMarkupHTML($markup, $charset); break; case 'text/xml': case 'application/xhtml+xml': phpQuery::debug("Loading XML, content type '{$this->contentType}'"); $loaded = $this->loadMarkupXML($markup, $charset); break; default: // for feeds or anything that sometimes doesn't use text/xml if (strpos('xml', $this->contentType) !== false) { phpQuery::debug("Loading XML, content type '{$this->contentType}'"); $loaded = $this->loadMarkupXML($markup, $charset); } else phpQuery::debug("Could not determine document type from content type '{$this->contentType}'"); } } else { // content type autodetection if ($this->isXML($markup)) { phpQuery::debug("Loading XML, isXML() == true"); $loaded = $this->loadMarkupXML($markup); if (! $loaded && $this->isXHTML) { phpQuery::debug('Loading as XML failed, trying to load as HTML, isXHTML == true'); $loaded = $this->loadMarkupHTML($markup); } } else { phpQuery::debug("Loading HTML, isXML() == false"); $loaded = $this->loadMarkupHTML($markup); } } return $loaded; } protected function loadMarkupReset() { $this->isXML = $this->isXHTML = $this->isHTML = false; } protected function documentCreate($charset, $version = '1.0') { if (! $version) $version = '1.0'; $this->document = new DOMDocument($version, $charset); $this->charset = $this->document->encoding; // $this->document->encoding = $charset; $this->document->formatOutput = true; $this->document->preserveWhiteSpace = true; } protected function loadMarkupHTML($markup, $requestedCharset = null) { if (phpQuery::$debug) phpQuery::debug('Full markup load (HTML): '.substr($markup, 0, 250)); $this->loadMarkupReset(); $this->isHTML = true; if (!isset($this->isDocumentFragment)) $this->isDocumentFragment = self::isDocumentFragmentHTML($markup); $charset = null; $documentCharset = $this->charsetFromHTML($markup); $addDocumentCharset = false; if ($documentCharset) { $charset = $documentCharset; $markup = $this->charsetFixHTML($markup); } else if ($requestedCharset) { $charset = $requestedCharset; } if (! $charset) $charset = phpQuery::$defaultCharset; // HTTP 1.1 says that the default charset is ISO-8859-1 // @see http://www.w3.org/International/O-HTTP-charset if (! $documentCharset) { $documentCharset = 'ISO-8859-1'; $addDocumentCharset = true; } // Should be careful here, still need 'magic encoding detection' since lots of pages have other 'default encoding' // Worse, some pages can have mixed encodings... we'll try not to worry about that $requestedCharset = strtoupper($requestedCharset); $documentCharset = strtoupper($documentCharset); phpQuery::debug("DOC: $documentCharset REQ: $requestedCharset"); if ($requestedCharset && $documentCharset && $requestedCharset !== $documentCharset) { phpQuery::debug("CHARSET CONVERT"); // Document Encoding Conversion // http://code.google.com/p/phpquery/issues/detail?id=86 if (function_exists('mb_detect_encoding')) { $possibleCharsets = array($documentCharset, $requestedCharset, 'AUTO'); $docEncoding = mb_detect_encoding($markup, implode(', ', $possibleCharsets)); if (! $docEncoding) $docEncoding = $documentCharset; // ok trust the document phpQuery::debug("DETECTED '$docEncoding'"); // Detected does not match what document says... if ($docEncoding !== $documentCharset) { // Tricky.. } if ($docEncoding !== $requestedCharset) { phpQuery::debug("CONVERT $docEncoding => $requestedCharset"); $markup = mb_convert_encoding($markup, $requestedCharset, $docEncoding); $markup = $this->charsetAppendToHTML($markup, $requestedCharset); $charset = $requestedCharset; } } else { phpQuery::debug("TODO: charset conversion without mbstring..."); } } $return = false; if ($this->isDocumentFragment) { phpQuery::debug("Full markup load (HTML), DocumentFragment detected, using charset '$charset'"); $return = $this->documentFragmentLoadMarkup($this, $charset, $markup); } else { if ($addDocumentCharset) { phpQuery::debug("Full markup load (HTML), appending charset: '$charset'"); $markup = $this->charsetAppendToHTML($markup, $charset); } phpQuery::debug("Full markup load (HTML), documentCreate('$charset')"); $this->documentCreate($charset); $return = phpQuery::$debug === 2 ? $this->document->loadHTML($markup) : @$this->document->loadHTML($markup); if ($return) $this->root = $this->document; } if ($return && ! $this->contentType) $this->contentType = 'text/html'; return $return; } protected function loadMarkupXML($markup, $requestedCharset = null) { if (phpQuery::$debug) phpQuery::debug('Full markup load (XML): '.substr($markup, 0, 250)); $this->loadMarkupReset(); $this->isXML = true; // check agains XHTML in contentType or markup $isContentTypeXHTML = $this->isXHTML(); $isMarkupXHTML = $this->isXHTML($markup); if ($isContentTypeXHTML || $isMarkupXHTML) { self::debug('Full markup load (XML), XHTML detected'); $this->isXHTML = true; } // determine document fragment if (! isset($this->isDocumentFragment)) $this->isDocumentFragment = $this->isXHTML ? self::isDocumentFragmentXHTML($markup) : self::isDocumentFragmentXML($markup); // this charset will be used $charset = null; // charset from XML declaration @var string $documentCharset = $this->charsetFromXML($markup); if (! $documentCharset) { if ($this->isXHTML) { // this is XHTML, try to get charset from content-type meta header $documentCharset = $this->charsetFromHTML($markup); if ($documentCharset) { phpQuery::debug("Full markup load (XML), appending XHTML charset '$documentCharset'"); $this->charsetAppendToXML($markup, $documentCharset); $charset = $documentCharset; } } if (! $documentCharset) { // if still no document charset... $charset = $requestedCharset; } } else if ($requestedCharset) { $charset = $requestedCharset; } if (! $charset) { $charset = phpQuery::$defaultCharset; } if ($requestedCharset && $documentCharset && $requestedCharset != $documentCharset) { // TODO place for charset conversion // $charset = $requestedCharset; } $return = false; if ($this->isDocumentFragment) { phpQuery::debug("Full markup load (XML), DocumentFragment detected, using charset '$charset'"); $return = $this->documentFragmentLoadMarkup($this, $charset, $markup); } else { // FIXME ??? if ($isContentTypeXHTML && ! $isMarkupXHTML) if (! $documentCharset) { phpQuery::debug("Full markup load (XML), appending charset '$charset'"); $markup = $this->charsetAppendToXML($markup, $charset); } // see http://pl2.php.net/manual/en/book.dom.php#78929 // LIBXML_DTDLOAD (>= PHP 5.1) // does XML ctalogues works with LIBXML_NONET // $this->document->resolveExternals = true; // TODO test LIBXML_COMPACT for performance improvement // create document $this->documentCreate($charset); if (phpversion() < 5.1) { $this->document->resolveExternals = true; $return = phpQuery::$debug === 2 ? $this->document->loadXML($markup) : @$this->document->loadXML($markup); } else { /** @link http://pl2.php.net/manual/en/libxml.constants.php */ $libxmlStatic = phpQuery::$debug === 2 ? LIBXML_DTDLOAD|LIBXML_DTDATTR|LIBXML_NONET : LIBXML_DTDLOAD|LIBXML_DTDATTR|LIBXML_NONET|LIBXML_NOWARNING|LIBXML_NOERROR; $return = $this->document->loadXML($markup, $libxmlStatic); // if (! $return) // $return = $this->document->loadHTML($markup); } if ($return) $this->root = $this->document; } if ($return) { if (! $this->contentType) { if ($this->isXHTML) $this->contentType = 'application/xhtml+xml'; else $this->contentType = 'text/xml'; } return $return; } else { throw new Exception("Error loading XML markup"); } } protected function isXHTML($markup = null) { if (! isset($markup)) { return strpos($this->contentType, 'xhtml') !== false; } // XXX ok ? return strpos($markup, "doctype) && is_object($dom->doctype) // ? $dom->doctype->publicId // : self::$defaultDoctype; } protected function isXML($markup) { // return strpos($markup, ']+http-equiv\\s*=\\s*(["|\'])Content-Type\\1([^>]+?)>@i', $markup, $matches ); if (! isset($matches[0])) return array(null, null); // get attr 'content' preg_match('@content\\s*=\\s*(["|\'])(.+?)\\1@', $matches[0], $matches); if (! isset($matches[0])) return array(null, null); return $this->contentTypeToArray($matches[2]); } protected function charsetFromHTML($markup) { $contentType = $this->contentTypeFromHTML($markup); return $contentType[1]; } protected function charsetFromXML($markup) { $matches; // find declaration preg_match('@<'.'?xml[^>]+encoding\\s*=\\s*(["|\'])(.*?)\\1@i', $markup, $matches ); return isset($matches[2]) ? strtolower($matches[2]) : null; } /** * Repositions meta[type=charset] at the start of head. Bypasses DOMDocument bug. * * @link http://code.google.com/p/phpquery/issues/detail?id=80 * @param $html */ protected function charsetFixHTML($markup) { $matches = array(); // find meta tag preg_match('@\s*]+http-equiv\\s*=\\s*(["|\'])Content-Type\\1([^>]+?)>@i', $markup, $matches, PREG_OFFSET_CAPTURE ); if (! isset($matches[0])) return; $metaContentType = $matches[0][0]; $markup = substr($markup, 0, $matches[0][1]) .substr($markup, $matches[0][1]+strlen($metaContentType)); $headStart = stripos($markup, ''); $markup = substr($markup, 0, $headStart+6).$metaContentType .substr($markup, $headStart+6); return $markup; } protected function charsetAppendToHTML($html, $charset, $xhtml = false) { // remove existing meta[type=content-type] $html = preg_replace('@\s*]+http-equiv\\s*=\\s*(["|\'])Content-Type\\1([^>]+?)>@i', '', $html); $meta = ''; if (strpos($html, ')@s', "{$meta}", $html ); } } else { return preg_replace( '@)@s', ''.$meta, $html ); } } protected function charsetAppendToXML($markup, $charset) { $declaration = '<'.'?xml version="1.0" encoding="'.$charset.'"?'.'>'; return $declaration.$markup; } public static function isDocumentFragmentHTML($markup) { return stripos($markup, 'documentFragmentCreate($node, $sourceCharset); // if ($fake === false) // throw new Exception("Error loading documentFragment markup"); // else // $return = array_merge($return, // $this->import($fake->root->childNodes) // ); // } else { // $return[] = $this->document->importNode($node, true); // } // } // return $return; // } else { // // string markup // $fake = $this->documentFragmentCreate($source, $sourceCharset); // if ($fake === false) // throw new Exception("Error loading documentFragment markup"); // else // return $this->import($fake->root->childNodes); // } if (is_array($source) || $source instanceof DOMNODELIST) { // dom nodes self::debug('Importing nodes to document'); foreach($source as $node) $return[] = $this->document->importNode($node, true); } else { // string markup $fake = $this->documentFragmentCreate($source, $sourceCharset); if ($fake === false) throw new Exception("Error loading documentFragment markup"); else return $this->import($fake->root->childNodes); } return $return; } /** * Creates new document fragment. * * @param $source * @return DOMDocumentWrapper */ protected function documentFragmentCreate($source, $charset = null) { $fake = new DOMDocumentWrapper(); $fake->contentType = $this->contentType; $fake->isXML = $this->isXML; $fake->isHTML = $this->isHTML; $fake->isXHTML = $this->isXHTML; $fake->root = $fake->document; if (! $charset) $charset = $this->charset; // $fake->documentCreate($this->charset); if ($source instanceof DOMNODE && !($source instanceof DOMNODELIST)) $source = array($source); if (is_array($source) || $source instanceof DOMNODELIST) { // dom nodes // load fake document if (! $this->documentFragmentLoadMarkup($fake, $charset)) return false; $nodes = $fake->import($source); foreach($nodes as $node) $fake->root->appendChild($node); } else { // string markup $this->documentFragmentLoadMarkup($fake, $charset, $source); } return $fake; } /** * * @param $document DOMDocumentWrapper * @param $markup * @return $document */ private function documentFragmentLoadMarkup($fragment, $charset, $markup = null) { // TODO error handling // TODO copy doctype // tempolary turn off $fragment->isDocumentFragment = false; if ($fragment->isXML) { if ($fragment->isXHTML) { // add FAKE element to set default namespace $fragment->loadMarkupXML('' .'' .''.$markup.''); $fragment->root = $fragment->document->firstChild->nextSibling; } else { $fragment->loadMarkupXML(''.$markup.''); $fragment->root = $fragment->document->firstChild; } } else { $markup2 = phpQuery::$defaultDoctype.''; $noBody = strpos($markup, 'loadMarkupHTML($markup2); // TODO resolv body tag merging issue $fragment->root = $noBody ? $fragment->document->firstChild->nextSibling->firstChild->nextSibling : $fragment->document->firstChild->nextSibling->firstChild->nextSibling; } if (! $fragment->root) return false; $fragment->isDocumentFragment = true; return true; } protected function documentFragmentToMarkup($fragment) { phpQuery::debug('documentFragmentToMarkup'); $tmp = $fragment->isDocumentFragment; $fragment->isDocumentFragment = false; $markup = $fragment->markup(); if ($fragment->isXML) { $markup = substr($markup, 0, strrpos($markup, '')); if ($fragment->isXHTML) { $markup = substr($markup, strpos($markup, '')+6); } } else { $markup = substr($markup, strpos($markup, '')+6); $markup = substr($markup, 0, strrpos($markup, '')); } $fragment->isDocumentFragment = $tmp; if (phpQuery::$debug) phpQuery::debug('documentFragmentToMarkup: '.substr($markup, 0, 150)); return $markup; } /** * Return document markup, starting with optional $nodes as root. * * @param $nodes DOMNode|DOMNodeList * @return string */ public function markup($nodes = null, $innerMarkup = false) { if (isset($nodes) && count($nodes) == 1 && $nodes[0] instanceof DOMDOCUMENT) $nodes = null; if (isset($nodes)) { $markup = ''; if (!is_array($nodes) && !($nodes instanceof DOMNODELIST) ) $nodes = array($nodes); if ($this->isDocumentFragment && ! $innerMarkup) foreach($nodes as $i => $node) if ($node->isSameNode($this->root)) { // var_dump($node); $nodes = array_slice($nodes, 0, $i) + phpQuery::DOMNodeListToArray($node->childNodes) + array_slice($nodes, $i+1); } if ($this->isXML && ! $innerMarkup) { self::debug("Getting outerXML with charset '{$this->charset}'"); // we need outerXML, so we can benefit from // $node param support in saveXML() foreach($nodes as $node) $markup .= $this->document->saveXML($node); } else { $loop = array(); if ($innerMarkup) foreach($nodes as $node) { if ($node->childNodes) foreach($node->childNodes as $child) $loop[] = $child; else $loop[] = $node; } else $loop = $nodes; self::debug("Getting markup, moving selected nodes (".count($loop).") to new DocumentFragment"); $fake = $this->documentFragmentCreate($loop); $markup = $this->documentFragmentToMarkup($fake); } if ($this->isXHTML) { self::debug("Fixing XHTML"); $markup = self::markupFixXHTML($markup); } self::debug("Markup: ".substr($markup, 0, 250)); return $markup; } else { if ($this->isDocumentFragment) { // documentFragment, html only... self::debug("Getting markup, DocumentFragment detected"); // return $this->markup( //// $this->document->getElementsByTagName('body')->item(0) // $this->document->root, true // ); $markup = $this->documentFragmentToMarkup($this); // no need for markupFixXHTML, as it's done thought markup($nodes) method return $markup; } else { self::debug("Getting markup (".($this->isXML?'XML':'HTML')."), final with charset '{$this->charset}'"); $markup = $this->isXML ? $this->document->saveXML() : $this->document->saveHTML(); if ($this->isXHTML) { self::debug("Fixing XHTML"); $markup = self::markupFixXHTML($markup); } self::debug("Markup: ".substr($markup, 0, 250)); return $markup; } } } protected static function markupFixXHTML($markup) { $markup = self::expandEmptyTag('script', $markup); $markup = self::expandEmptyTag('select', $markup); $markup = self::expandEmptyTag('textarea', $markup); return $markup; } public static function debug($text) { phpQuery::debug($text); } /** * expandEmptyTag * * @param $tag * @param $xml * @return unknown_type * @author mjaque at ilkebenson dot com * @link http://php.net/manual/en/domdocument.savehtml.php#81256 */ public static function expandEmptyTag($tag, $xml){ $indice = 0; while ($indice< strlen($xml)){ $pos = strpos($xml, "<$tag ", $indice); if ($pos){ $posCierre = strpos($xml, ">", $pos); if ($xml[$posCierre-1] == "/"){ $xml = substr_replace($xml, ">", $posCierre-1, 2); } $indice = $posCierre; } else break; } return $xml; } }