- Jun. 30th, 2006
- 1 comment
<?
// +----------------------------------------------------------------------+
// | HTMLParser                                                           |
// | Simple HTML Parsing Library                                          |
// | Based on Jose Solorzano's Library; his notice is below.              |
// +----------------------------------------------------------------------+
// | Portions Copyright (c) 2004-2005 Jaimie Sirovich                     |
// +----------------------------------------------------------------------+
// | This program is free software; you can redistribute it and/or        |
// | modify it under the terms of the GNU General Public License          |
// | as published by the Free Software Foundation; either version 2       |
// | of the License, or (at your option) any later version.               |
// |                                                                      |
// | This program is distributed in the hope that it will be useful,      |
// | but WITHOUT ANY WARRANTY; without even the implied warranty of       |
// | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the         |
// | GNU General Public License for more details.                         |
// |                                                                      |
// | You should have received a copy of the GNU General Public License    |
// | along with this program; if not, write to the Free Software          |
// | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA        |
// +----------------------------------------------------------------------+
// | Author: Jaimie Sirovich <jsirovic@gmail.com>                         |
// +----------------------------------------------------------------------+
/*
* Copyright (c) 2003 Jose Solorzano. All rights reserved.
* Redistribution of source must retain this copyright notice.
*
* Jose Solorzano (http://jexpert.us) is a software consultant.
*
* Contributions by:
* Leo West (performance improvements)
*/
// Node kinds stored in HTMLParser::$nodeType and returned by getNodeType().
define('NODE_TYPE_START', 0);        // not assigned anywhere in this file
define('NODE_TYPE_BEGINELEMENT', 1); // opening tag, e.g. <a href="...">
define('NODE_TYPE_ENDELEMENT', 2);   // closing tag, e.g. </a>
define('NODE_TYPE_TEXT', 3);         // text run (or a '<' that does not start a tag)
define('NODE_TYPE_COMMENT', 4);      // comment node
define('NODE_TYPE_FINISH', 5);       // set when parse() finds no further node
/**
 * HTMLParser: a small forward-only HTML tokenizer.
 *
 * Construct it with an HTML string and call parse() repeatedly. Every call
 * that returns true has loaded exactly one node (start tag, end tag, text run
 * or comment) into nodeType / nodeName / nodeValue / nodeAttributes. A false
 * return means the input is exhausted; nodeType is then NODE_TYPE_FINISH.
 *
 * The class relies on the global NODE_TYPE_* constants.
 *
 * Behaviour worth knowing:
 *  - Attribute names are lower-cased; tag names keep their original case.
 *  - nodeAttributes is only reset when a tag is read, so after a text node it
 *    still holds the attributes of the tag read before it.
 *  - A '>' inside a quoted attribute value ends the value (and the tag).
 *  - A '<' that is not followed by a valid tag name is reported as text.
 *
 * The underscore-prefixed members are internal, but stay public because the
 * original declared them with "var" / without visibility.
 */
class HTMLParser {
    /** One of the NODE_TYPE_* constants; null until parse() has been called. */
    public $nodeType;

    /** Tag name, '_TEXT' for text nodes or 'COMMENT' for comments. */
    public $nodeName;

    /** Text of a text node, full markup of a comment, '' for tags. */
    public $nodeValue;

    /** Hash of attribute name (lower-cased) => value for the last tag read. */
    public $nodeAttributes;

    /** The HTML being tokenized. */
    public $_htmlText;

    /** strlen() of $_htmlText. */
    public $_htmlTextLength;

    /** Byte offset of the current character. */
    public $_htmlTextIndex;

    /** Character at $_htmlTextIndex, or the integer -1 at end of input. */
    public $_currentChar;

    /** Blank characters. */
    public $_B_ARRAY;

    /** Blank characters plus '='. */
    public $_BOE_ARRAY;

    /** Blank characters plus '/'. */
    public $_BOS_ARRAY;

    /**
     * @param string $str HTML text to tokenize
     */
    public function __construct($str)
    {
        $this->_htmlText = (string) $str;
        $this->_htmlTextLength = strlen($this->_htmlText);
        $this->_setTextIndex(0);
        $this->_B_ARRAY = array(" ", "\t", "\r", "\n");
        $this->_BOE_ARRAY = array(" ", "\t", "\r", "\n", "=");
        $this->_BOS_ARRAY = array(" ", "\t", "\r", "\n", "/");
        $this->nodeType = null;
        $this->nodeName = '';
        $this->nodeValue = '';
        $this->nodeAttributes = array();
    }

    /**
     * PHP 4 style constructor, kept so that code calling
     * parent::HTMLParser($str) keeps working. PHP 8 no longer treats a method
     * named after the class as a constructor, hence __construct() above.
     *
     * @param string $str HTML text to tokenize
     */
    public function HTMLParser($str)
    {
        $this->__construct($str);
    }

    /** @return int|null NODE_TYPE_* constant of the current node */
    public function getNodeType()
    {
        return $this->nodeType;
    }

    /** @return string name of the current node */
    public function getNodeName()
    {
        return $this->nodeName;
    }

    /** @return string value of the current node */
    public function getNodeValue()
    {
        return $this->nodeValue;
    }

    /** @return array attribute hash of the last tag read */
    public function getNodeAttributes()
    {
        return $this->nodeAttributes;
    }

    /**
     * Advance to the next node.
     *
     * @return bool true when a node was loaded, false at end of input
     */
    public function parse()
    {
        $text = $this->_skipToElement();
        if ($text !== '') {
            $this->nodeType = NODE_TYPE_TEXT;
            $this->nodeName = '_TEXT';
            $this->nodeValue = $text;
            return true;
        }
        return $this->_readTag();
    }

    /** Forget the attributes of the previously read tag. */
    public function _clearAttributes()
    {
        $this->nodeAttributes = array();
    }

    /**
     * Read the tag (or comment) starting at the current '<'.
     *
     * @return bool false when the current character is not '<' (end of
     *              input), true otherwise
     */
    public function _readTag()
    {
        if ($this->_currentChar !== '<') {
            $this->nodeType = NODE_TYPE_FINISH;
            return false;
        }

        $tagStart = $this->_htmlTextIndex;
        $this->_clearAttributes();
        $this->_skipMaxInTag('<', 1);

        // Closing tag: </name ...>
        if ($this->_currentChar === '/') {
            $this->_moveNext();
            $name = $this->_skipToBlanksInTag();
            $this->nodeType = NODE_TYPE_ENDELEMENT;
            $this->nodeName = $name;
            $this->nodeValue = '';
            $this->_skipEndOfTag();
            return true;
        }

        $name = $this->_skipToBlanksOrSlashInTag();

        if (!$this->_isValidTagIdentifier($name)) {
            if (strpos($name, '!--') === 0) {
                // Look for the terminator from just after "<!--". Searching
                // from the current position (as the tag-name scan left it)
                // would miss a "-->" that the scan already consumed.
                $resume = $this->_htmlTextIndex;
                $this->_setTextIndex($tagStart + 4);
                $rest = $this->_skipToStringInTag('-->');
                if ($rest !== '') {
                    $this->nodeType = NODE_TYPE_COMMENT;
                    $this->nodeName = 'COMMENT';
                    $this->nodeValue = '<!--' . $rest;
                    return true;
                }
                // Unterminated comment: fall through and report it as text.
                $this->_setTextIndex($resume);
            }

            // Not a tag after all; hand back what was consumed as text.
            $this->nodeType = NODE_TYPE_TEXT;
            $this->nodeName = '_TEXT';
            $this->nodeValue = '<' . $name;
            return true;
        }

        // Opening tag: collect name[=value] pairs until '>' or end of input.
        $this->nodeType = NODE_TYPE_BEGINELEMENT;
        $this->nodeValue = '';
        $this->nodeName = $name;
        while ($this->_skipBlanksInTag()) {
            $attrName = $this->_skipToBlanksOrEqualsInTag();
            if ($attrName === '' || $attrName === '/') {
                continue;
            }
            $this->_skipBlanksInTag();
            if ($this->_currentChar === '=') {
                $this->_skipEqualsInTag();
                $this->_skipBlanksInTag();
                $this->nodeAttributes[strtolower($attrName)] = $this->_readValueInTag();
            } else {
                // Valueless attribute such as "checked" or "selected".
                $this->nodeAttributes[strtolower($attrName)] = '';
            }
        }
        $this->_skipEndOfTag();
        return true;
    }

    /**
     * @param string $name candidate tag name
     * @return int 1 when $name consists only of letters, digits, '_' and '-',
     *             0 otherwise (preg_match() result)
     */
    public function _isValidTagIdentifier($name)
    {
        return preg_match('#^[A-Za-z0-9_\-]+$#', $name);
    }

    /** @return bool true when at least one blank was skipped */
    public function _skipBlanksInTag()
    {
        return $this->_skipInTag($this->_B_ARRAY) !== '';
    }

    /** @return string text consumed up to the next blank, '=' or '>' */
    public function _skipToBlanksOrEqualsInTag()
    {
        return $this->_skipToInTag($this->_BOE_ARRAY);
    }

    /** @return string text consumed up to the next blank or '>' */
    public function _skipToBlanksInTag()
    {
        return $this->_skipToInTag($this->_B_ARRAY);
    }

    /** @return string text consumed up to the next blank, '/' or '>' */
    public function _skipToBlanksOrSlashInTag()
    {
        return $this->_skipToInTag($this->_BOS_ARRAY);
    }

    /** @return string the '=' that was skipped, or '' */
    public function _skipEqualsInTag()
    {
        return $this->_skipMaxInTag('=', 1);
    }

    /**
     * Read an attribute value: double-quoted, single-quoted or bare.
     *
     * @return string the value without its quotes
     */
    public function _readValueInTag()
    {
        $quote = $this->_currentChar;
        if ($quote === '"' || $quote === "'") {
            $this->_skipMaxInTag($quote, 1);
            $value = $this->_skipToInTag($quote);
            $this->_skipMaxInTag($quote, 1);
            return $value;
        }
        return $this->_skipToBlanksInTag();
    }

    /**
     * Move to byte offset $index and refresh $_currentChar.
     *
     * @param int $index new offset
     * @return bool false when $index is at or past the end of the text
     */
    public function _setTextIndex($index)
    {
        $this->_htmlTextIndex = $index;
        if ($index >= $this->_htmlTextLength) {
            $this->_currentChar = -1;
            return false;
        }
        // Square brackets: the {$index} offset syntax was removed in PHP 8.
        $this->_currentChar = $this->_htmlText[$index];
        return true;
    }

    /** @return bool false when already at end of input */
    public function _moveNext()
    {
        if ($this->_htmlTextIndex < $this->_htmlTextLength) {
            $this->_setTextIndex($this->_htmlTextIndex + 1);
            return true;
        }
        return false;
    }

    /** Skip up to and including the next '>' (or to end of input). */
    public function _skipEndOfTag()
    {
        while (($ch = $this->_currentChar) !== -1) {
            $this->_moveNext();
            if ($ch === '>') {
                return;
            }
        }
    }

    /**
     * Consume a run of characters from $chars, stopping at '>'.
     *
     * @param array|string $chars accepted characters
     * @return string the consumed run
     */
    public function _skipInTag($chars)
    {
        $sb = '';
        while (($ch = $this->_currentChar) !== -1) {
            if ($ch === '>' || !$this->_isOneOf($ch, $chars)) {
                break;
            }
            $sb .= $ch;
            $this->_moveNext();
        }
        return $sb;
    }

    /**
     * Like _skipInTag(), but consumes at most $maxChars characters.
     *
     * @param array|string $chars    accepted characters
     * @param int          $maxChars upper bound on consumed characters
     * @return string the consumed run
     */
    public function _skipMaxInTag($chars, $maxChars)
    {
        $sb = '';
        $count = 0;
        while (($ch = $this->_currentChar) !== -1 && $count++ < $maxChars) {
            if ($ch === '>' || !$this->_isOneOf($ch, $chars)) {
                break;
            }
            $sb .= $ch;
            $this->_moveNext();
        }
        return $sb;
    }

    /**
     * Consume everything up to one of $chars (or '>') within a tag.
     *
     * @param array|string $chars stop characters
     * @return string the consumed text, stop character excluded
     */
    public function _skipToInTag($chars)
    {
        $sb = '';
        while (($ch = $this->_currentChar) !== -1) {
            if ($ch === '>' || $this->_isOneOf($ch, $chars)) {
                break;
            }
            $sb .= $ch;
            $this->_moveNext();
        }
        return $sb;
    }

    /**
     * Consume everything up to the beginning of a tag.
     *
     * @return string the consumed text ('' when already at '<' or at the end)
     */
    public function _skipToElement()
    {
        $sb = '';
        while (($ch = $this->_currentChar) !== -1) {
            if ($ch === '<') {
                break;
            }
            $sb .= $ch;
            $this->_moveNext();
        }
        return $sb;
    }

    /**
     * Returns the text between the current position and $needle, inclusive,
     * or '' if $needle is not found. The current index is moved to just after
     * $needle, or not moved at all if nothing is found.
     *
     * @param string $needle text to search for
     * @return string
     */
    public function _skipToStringInTag($needle)
    {
        $pos = strpos($this->_htmlText, $needle, $this->_htmlTextIndex);
        if ($pos === false) {
            return '';
        }
        $top = $pos + strlen($needle);
        $tmp = substr($this->_htmlText, $this->_htmlTextIndex, $top - $this->_htmlTextIndex);
        $this->_setTextIndex($top);
        return $tmp;
    }

    /**
     * Find every <$tag ...>...</$tag> block in $str (case-insensitive,
     * non-greedy, spanning newlines).
     *
     * Declared static because it uses no instance state and is called as
     * HTMLParser::findBlocks(); calling it on an instance still works.
     *
     * @param string $tag tag name, e.g. 'form'
     * @param string $str HTML to search
     * @return array list of matches in PREG_SET_ORDER form; element [0] of
     *               each match is the complete block. Empty when nothing
     *               matches or the regex engine reports an error.
     */
    public static function findBlocks($tag, $str)
    {
        $quoted = preg_quote($tag, '#');
        // \b keeps 'form' from matching '<formula>'.
        $pattern = '#(<' . $quoted . '\b.*?>.*?</' . $quoted . '\s*>)#is';
        $captures = array();
        if (!preg_match_all($pattern, (string) $str, $captures, PREG_SET_ORDER)) {
            return array();
        }
        return $captures;
    }

    /**
     * Test whether the single character $ch belongs to $chars.
     *
     * Several internal callers pass a plain string ('<', '=', '"') instead of
     * an array; the original ran count() on it, which throws on PHP 8.
     *
     * @param string       $ch    one character
     * @param array|string $chars list of characters, or a string of them
     * @return bool
     */
    private function _isOneOf($ch, $chars)
    {
        if (is_array($chars)) {
            return in_array($ch, $chars, true);
        }
        $chars = (string) $chars;
        return $chars !== '' && strpos($chars, $ch) !== false;
    }
}
?><pre><?
/*
 * HTML tainting audit.
 *
 * For every page in $page_urls: download it, extract each <form> block, work
 * out the values its inputs and selects would submit, put $_sentinel into the
 * first text field, submit the form with a GET request through cURL and report
 * whether the sentinel markup appears verbatim in the response.
 *
 * The form's method attribute is not consulted; every form is submitted with
 * GET, as before.
 */

/**
 * Fetch one attribute from an HTMLParser attribute hash.
 *
 * Attribute values in HTML source are entity-encoded (e.g. "a&amp;b"), so the
 * value is decoded before it is used in a request.
 *
 * @param array  $attrs hash as returned by HTMLParser::getNodeAttributes()
 * @param string $key   lower-case attribute name
 * @return string decoded value, or '' when the attribute is absent
 */
function audit_attr($attrs, $key)
{
    if (!is_array($attrs) || !isset($attrs[$key])) {
        return '';
    }
    return html_entity_decode($attrs[$key], ENT_QUOTES);
}

/**
 * Escape text for display inside the report. Done byte-wise so that pages in
 * any character encoding are shown rather than dropped.
 *
 * @param string $text raw text
 * @return string text with &, < and > replaced by entities
 */
function audit_escape($text)
{
    return str_replace(array('&', '<', '>'), array('&amp;', '&lt;', '&gt;'), $text);
}

/**
 * Resolve a form's action attribute against the URL of the page it is on.
 *
 * @param string $action   decoded action attribute ('' when missing)
 * @param string $page_url absolute URL of the page containing the form
 * @return string absolute URL to submit to
 */
function audit_submit_url($action, $page_url)
{
    if ($action === '') {
        // No action: the form submits to the page itself.
        return $page_url;
    }
    if (preg_match('#^https?://#i', $action)) {
        return $action;
    }

    $parts = parse_url($page_url);
    if (!is_array($parts) || !isset($parts['host'])) {
        return $action;
    }
    $scheme = isset($parts['scheme']) ? $parts['scheme'] : 'http';
    if (strpos($action, '//') === 0) {
        // Protocol-relative URL.
        return $scheme . ':' . $action;
    }

    $origin = $scheme . '://' . $parts['host'];
    if (isset($parts['port'])) {
        $origin .= ':' . $parts['port'];
    }
    if ($action[0] === '/') {
        return $origin . $action;
    }

    $path = isset($parts['path']) ? $parts['path'] : '/';
    if ($action[0] === '?') {
        return $origin . $path . $action;
    }
    // Relative path: resolve against the directory of the page.
    return $origin . substr($path, 0, strrpos($path, '/') + 1) . $action;
}

$_sentinel = '<h1>testing 123</h1>';
// Some blogs I happen to like. Duh, none of them are exploitable, but go find some pages on your own to check.
// Note: This checks pages, not sites; so the URLs below reference the home pages.
$page_urls = array('http://www.seoegghead.com', 'http://www.seroundtable.com', 'http://www.seobythesea.com', 'http://www.seoblackhat.com', 'http://ha.ckers.org');

// findBlocks() uses no instance state; going through an instance works
// whether or not the method is declared static.
$block_finder = new HTMLParser('');

foreach ($page_urls as $page_url) {
    $page_html = file_get_contents($page_url);
    if ($page_html === false) {
        echo '<b>Looking at ' . audit_escape($page_url) . '; page could not be fetched.</b><br>';
        continue;
    }

    $page_forms = $block_finder->findBlocks('form', $page_html);
    if (!is_array($page_forms)) {
        $page_forms = array();
    }
    echo '<b>Looking at ' . audit_escape($page_url) . '; ' . count($page_forms) . ' form(s) found.</b><br>';

    $form_cnt = 0;
    foreach ($page_forms as $page_form) {
        $form_cnt++;
        $page_form = $page_form[0];

        // Show the form source as text; unescaped, the browser would render
        // the remote form inside this report.
        ob_start();
        var_dump($page_form);
        echo audit_escape(ob_get_clean());

        $parser = new HTMLParser($page_form);
        $values = array();
        $first_text = false;
        $_action = ''; // reset per form so a previous form's action cannot leak in

        while ($parser->parse()) {
            if ($parser->getNodeType() !== NODE_TYPE_BEGINELEMENT) {
                continue;
            }
            // Tag names keep their source case; compare case-insensitively.
            $node_name = strtolower($parser->getNodeName());
            $_tmp = $parser->getNodeAttributes();

            if ($node_name === 'form') {
                $_action = audit_attr($_tmp, 'action');
            } else if ($node_name === 'input') {
                $input_type = strtolower(audit_attr($_tmp, 'type'));
                $input_name = audit_attr($_tmp, 'name');
                if ($input_name === '') {
                    // Inputs without a name are never submitted.
                    continue;
                }
                if ($input_type === 'checkbox' || $input_type === 'radio') {
                    if (isset($_tmp['checked'])) {
                        $values[$input_name] = isset($_tmp['value']) ? audit_attr($_tmp, 'value') : 'on';
                    }
                } else if ($input_type === 'text' || $input_type === '') {
                    if (!$first_text) {
                        // The first text field carries the probe.
                        $values[$input_name] = $_sentinel;
                        $first_text = true;
                    } else {
                        $values[$input_name] = audit_attr($_tmp, 'value');
                    }
                } else if ($input_type === 'hidden') {
                    $values[$input_name] = audit_attr($_tmp, 'value');
                }
            } else if ($node_name === 'select') {
                $_select_name = audit_attr($_tmp, 'name');
                // Consume the select's content up to its closing tag.
                while ($parser->parse()) {
                    $inner_name = strtolower($parser->getNodeName());
                    $inner_type = $parser->getNodeType();
                    if ($inner_name === 'select' && $inner_type === NODE_TYPE_ENDELEMENT) {
                        break;
                    }
                    if ($inner_type !== NODE_TYPE_BEGINELEMENT || $inner_name !== 'option') {
                        continue;
                    }
                    $_tmp = $parser->getNodeAttributes();
                    if ($_select_name !== '' && isset($_tmp['selected'])) {
                        $values[$_select_name] = audit_attr($_tmp, 'value');
                    }
                }
            }
        }

        $url_parts = array();
        foreach ($values as $param => $value) {
            $url_parts[] = urlencode($param) . '=' . urlencode($value);
        }
        $url_str = implode('&', $url_parts);

        $submit_url = audit_submit_url($_action, $page_url);
        // The action may already carry a query string.
        $separator = (strpos($submit_url, '?') === false) ? '?' : '&';

        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $submit_url . $separator . $url_str);
        curl_setopt($ch, CURLOPT_FAILONERROR, 1);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        $http_result = curl_exec($ch);
        curl_close($ch);

        if ($http_result === false) {
            // Do not report a failed request as a clean result.
            echo 'HRMM; request failed for: ' . audit_escape($page_url) . "; form $form_cnt<br>";
        } else if (strpos($http_result, $_sentinel) !== false) {
            echo 'UH OH; pontentially viable attack on: ' . audit_escape($page_url) . "; form $form_cnt<br>";
        } else {
            echo 'HRMM; no attack found for: ' . audit_escape($page_url) . "; form $form_cnt<br>";
        }
    }
}
?>
"Only One Wise Comment Banged Out Somewhere On The Internet ..."SEO Egghead » Blog Archive » Auditing for HTML Tainting[...] Code for HTML Auditing Archives [...]
|
















