get('Core.LexerImpl');
}
$needs_tracking =
$config->get('Core.MaintainLineNumbers') ||
$config->get('Core.CollectErrors');
$inst = null;
if (is_object($lexer)) {
$inst = $lexer;
} else {
if (is_null($lexer)) { do {
// auto-detection algorithm
if ($needs_tracking) {
$lexer = 'DirectLex';
break;
}
if (
class_exists('DOMDocument') &&
method_exists('DOMDocument', 'loadHTML') &&
!extension_loaded('domxml')
) {
// check for DOM support, because while it's part of the
// core, it can be disabled compile time. Also, the PECL
// domxml extension overrides the default DOM, and is evil
// and nasty and we shan't bother to support it
$lexer = 'DOMLex';
} else {
$lexer = 'DirectLex';
}
} while(0); } // do..while so we can break
// instantiate recognized string names
switch ($lexer) {
case 'DOMLex':
$inst = new HTMLPurifier_Lexer_DOMLex();
break;
case 'DirectLex':
$inst = new HTMLPurifier_Lexer_DirectLex();
break;
case 'PH5P':
$inst = new HTMLPurifier_Lexer_PH5P();
break;
default:
throw new HTMLPurifier_Exception("Cannot instantiate unrecognized Lexer type " . htmlspecialchars($lexer));
}
}
if (!$inst) throw new HTMLPurifier_Exception('No lexer was instantiated');
// once PHP DOM implements native line numbers, or we
// hack out something using XSLT, remove this stipulation
if ($needs_tracking && !$inst->tracksLineNumbers) {
throw new HTMLPurifier_Exception('Cannot use lexer that does not support line numbers with Core.MaintainLineNumbers or Core.CollectErrors (use DirectLex instead)');
}
return $inst;
}
// -- CONVENIENCE MEMBERS ---------------------------------------------
public function __construct() {
$this->_entity_parser = new HTMLPurifier_EntityParser();
}
/**
* Most common entity to raw value conversion table for special entities.
*/
protected $_special_entity2str =
array(
'"' => '"',
'&' => '&',
'<' => '<',
'>' => '>',
''' => "'",
''' => "'",
''' => "'"
);
/**
* Parses special entities into the proper characters.
*
* This string will translate escaped versions of the special characters
* into the correct ones.
*
* @warning
* You should be able to treat the output of this function as
* completely parsed, but that's only because all other entities should
* have been handled previously in substituteNonSpecialEntities()
*
* @param $string String character data to be parsed.
* @returns Parsed character data.
*/
public function parseData($string) {
// following functions require at least one character
if ($string === '') return '';
// subtracts amps that cannot possibly be escaped
$num_amp = substr_count($string, '&') - substr_count($string, '& ') -
($string[strlen($string)-1] === '&' ? 1 : 0);
if (!$num_amp) return $string; // abort if no entities
$num_esc_amp = substr_count($string, '&');
$string = strtr($string, $this->_special_entity2str);
// code duplication for sake of optimization, see above
$num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
($string[strlen($string)-1] === '&' ? 1 : 0);
if ($num_amp_2 <= $num_esc_amp) return $string;
// hmm... now we have some uncommon entities. Use the callback.
$string = $this->_entity_parser->substituteSpecialEntities($string);
return $string;
}
/**
* Lexes an HTML string into tokens.
*
* @param $string String HTML.
* @return HTMLPurifier_Token array representation of HTML.
*/
public function tokenizeHTML($string, $config, $context) {
trigger_error('Call to abstract class', E_USER_ERROR);
}
/**
* Translates CDATA sections into regular sections (through escaping).
*
* @param $string HTML string to process.
* @returns HTML with CDATA sections escaped.
*/
protected static function escapeCDATA($string) {
return preg_replace_callback(
'//s',
array('HTMLPurifier_Lexer', 'CDATACallback'),
$string
);
}
/**
* Special CDATA case that is especially convoluted for