Note: I didn't realize that WordPress has been converting straight quotes into "pretty" curly quotes since version 2.1. I have turned that feature off, so copying and pasting the code should now work.
Below is the main class necessary for the cloaking functionality, "SimpleCloakV2":
<?php
// HTML snippet that _addMetaRobotsExcludeProxiesCallback() inserts right after
// "</title>"; metaRobotsExcludeProxies() overwrites it with the meta tag it is given.
$__metaRobotsExcludeProxiesCallbackHTML = '';
/*
// +----------------------------------------------------------------------+
// | SimpleCloakV2 Version 2 |
// | Class for cloaking content |
// | http://www.SEOEgghead.com |
// +----------------------------------------------------------------------+
// | Copyright (c) 2005-2006 Jaimie Sirovich and Cristian Darie |
// +----------------------------------------------------------------------+
*/
// load configuration file
require_once('config.inc.php');
class SimpleCloakV2
{
// Opens the MySQL connection described by the DB_* constants and selects the
// configured database.
// Returns the connection link, or true when USE_CUSTOM_CONNECT_CODE is set
// (the host application is then expected to be connected already).
// Terminates the script with a message if connecting or selecting fails.
function _connect()
{
    // the application manages its own connection; nothing to open here
    if (USE_CUSTOM_CONNECT_CODE) {
        return true;
    }
    // connect to the MySQL server
    $link = mysql_connect(DB_HOST, DB_USER, DB_PASSWORD);
    if (!$link) {
        die("Could not connect: " . mysql_error());
    }
    // select the database that holds the cloaking tables
    if (!mysql_select_db(DB_DATABASE)) {
        die("Could not select database");
    }
    return $link;
}
// Closes a connection opened by _connect().
// Returns true without touching anything when USE_CUSTOM_CONNECT_CODE is set,
// because the connection then belongs to the host application.
function _close($dbLink)
{
    if (!USE_CUSTOM_CONNECT_CODE) {
        // we opened this link ourselves, so we close it
        mysql_close($dbLink);
        return;
    }
    return true;
}
// Returns a confidence level that the current request comes from a known spider.
//   $spider_name           - spider to test for; '' matches any spider
//   $check_uas             - compare the User-Agent header with stored 'UA' records (+2)
//   $check_ips             - compare the remote address with stored 'IP' records (+3)
//   $use_user_defined_data - when false, only records flagged 'N' (not user defined) are used
//   $ignore_bad_uas        - when true, records stored under the spider name 'bad' are excluded
// The result is 0, 2, 3 or 5.
function isSpider($spider_name = '', $check_uas = true, $check_ips = true, $use_user_defined_data = true, $ignore_bad_uas = true)
{
    // default confidence level to 0
    $confidence = 0;
    $user_defined_filter = $use_user_defined_data ? '' : 'N';
    $excluded_spider = $ignore_bad_uas ? 'bad' : '';

    // The header may be absent (e.g. command-line runs or clients that send no
    // User-Agent). _get() drops any filter whose value is empty, so an empty
    // value must not be looked up: it would match every record of the spider.
    $user_agent = isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : '';
    $remote_addr = isset($_SERVER['REMOTE_ADDR']) ? $_SERVER['REMOTE_ADDR'] : '';

    // matching user agent?
    if ($check_uas && $user_agent) {
        if (SimpleCloakV2::_get(0, $spider_name, 'UA', $user_agent, '', $user_defined_filter, $excluded_spider)) {
            $confidence += 2;
        }
    }
    // matching IP?
    if ($check_ips && $remote_addr) {
        if (SimpleCloakV2::_get(0, $spider_name, 'IP', '', $remote_addr, $user_defined_filter, $excluded_spider)) {
            $confidence += 3;
        }
    }
    // return confidence level
    return $confidence;
}
// Retrieves cloaking records filtered by the supplied parameters.
// A parameter that evaluates to false (0, '', '0') adds no filter.
//   $id                   - record id
//   $spider_name          - exact spider name
//   $record_type          - 'UA' or 'IP'
//   $value                - exact match on the stored value
//   $wildcard_value       - matches when the stored value equals it or is a
//                           dot-terminated prefix of it (e.g. stored "1.2" matches "1.2.3.4")
//   $is_user_defined_data - 'Y' or 'N', compared with the is_user_defined column
//   $not_spider_name      - spider name to exclude
// Returns an array of associative rows; the array is empty when nothing
// matches or when the query fails.
function _get($id = 0, $spider_name = '', $record_type = '',
    $value = '', $wildcard_value = '', $is_user_defined_data = '', $not_spider_name = '')
{
    // "1 = 1" lets every filter below be appended with AND; it is used instead
    // of the TRUE literal because some older MySQL servers do not know TRUE and
    // fail with "Unknown column 'TRUE' in 'where clause'"
    $q = " SELECT cloak_data.* FROM cloak_data WHERE 1 = 1 ";
    // add filters
    if ($id) {
        $id = (int) $id;
        $q .= " AND id = $id ";
    }
    if ($spider_name) {
        $spider_name = mysql_escape_string($spider_name);
        $q .= " AND spider_name = '$spider_name' ";
    }
    if ($record_type) {
        $record_type = mysql_escape_string($record_type);
        $q .= " AND record_type = '$record_type' ";
    }
    if ($value) {
        $value = mysql_escape_string($value);
        $q .= " AND value = '$value' ";
    }
    if ($wildcard_value) {
        $wildcard_value = mysql_escape_string($wildcard_value);
        $q .= " AND ( '$wildcard_value' = value OR '$wildcard_value' LIKE CONCAT(value, '.%') ) ";
    }
    if ($is_user_defined_data) {
        // the column is called is_user_defined (see _insertSpiderData)
        $is_user_defined_data = mysql_escape_string($is_user_defined_data);
        $q .= " AND is_user_defined = '$is_user_defined_data' ";
    }
    if ($not_spider_name) {
        $not_spider_name = mysql_escape_string($not_spider_name);
        $q .= " AND spider_name <> '$not_spider_name' ";
    }
    $dbLink = SimpleCloakV2::_connect();
    // execute the query
    $tmp = mysql_query($q);
    // collect the results as associative arrays while the link is still open;
    // a failed query yields an empty result instead of fetch warnings
    $rows = array();
    if ($tmp) {
        while ($_x = mysql_fetch_assoc($tmp)) {
            $rows[] = $_x;
        }
    }
    SimpleCloakV2::_close($dbLink);
    return $rows;
}
// Updates the entire database with fresh spider data, but only if our data is
// more than 7 days old and the online version from iplists.com has changed.
//   $delete_user_defined_data - when true, user-defined records of the refreshed
//                               spiders are deleted as well
// Returns true when the per-spider updates were run, false when no update was
// attempted (checked too recently, version unavailable, or version unchanged).
// NOTE: true does not guarantee that rows were written; _updateCloakingDB()
// silently skips a spider whose list could not be downloaded or parsed.
function updateAll($delete_user_defined_data = false)
{
    $dbLink = SimpleCloakV2::_connect();
    // retrieve last update information from database; the table may be empty
    // (first run) or the query may fail, in which case there is no previous state
    $tmp = mysql_query("SELECT cloak_update.* FROM cloak_update");
    $updated = $tmp ? mysql_fetch_assoc($tmp) : false;
    $db_version = $updated ? $updated['version'] : null;
    $updated_on = $updated ? $updated['updated_on'] : null;
    // if the last check is more recent than 7 days, don't attempt an update
    if (isset($updated_on) &&
        (strtotime($updated_on) > strtotime("-604800 seconds")))
    {
        SimpleCloakV2::_close($dbLink);
        // return false to indicate an update wasn't performed
        return false;
    }
    // read the latest iplists version
    $version_url = 'http://www.iplists.com/nw/version.php';
    $ch = curl_init();
    curl_setopt ($ch, CURLOPT_URL, $version_url);
    curl_setopt ($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt ($ch, CURLOPT_TIMEOUT, 60);
    $latest_version = curl_exec($ch);
    curl_close($ch);
    // if no updated version information was retrieved, abort
    if (!$latest_version)
    {
        // release the connection on this path too
        SimpleCloakV2::_close($dbLink);
        // return false to indicate an update wasn't performed
        return false;
    }
    // save the update data; the value is escaped only for the SQL statement so
    // that the comparison below uses the version exactly as it was downloaded
    $escaped_version = mysql_escape_string($latest_version);
    mysql_query("DELETE FROM cloak_update");
    mysql_query("INSERT INTO cloak_update (version, updated_on) " .
        "VALUES('$escaped_version', NOW())");
    // if we already have the current data, don't attempt an update
    // (strict comparison: with == numeric-looking versions such as "1.10" and
    // "1.1" would be considered equal)
    if ($latest_version === $db_version)
    {
        // go through _close() so USE_CUSTOM_CONNECT_CODE is honoured
        SimpleCloakV2::_close($dbLink);
        // return false to indicate an update wasn't performed
        return false;
    }
    // update the database, one list per spider
    $lists = array(
        'google'    => 'http://www.iplists.com/nw/google.txt',
        'yahoo'     => 'http://www.iplists.com/nw/inktomi.txt',
        'msn'       => 'http://www.iplists.com/nw/msn.txt',
        'ask'       => 'http://www.iplists.com/nw/askjeeves.txt',
        'altavista' => 'http://www.iplists.com/nw/altavista.txt',
        'lycos'     => 'http://www.iplists.com/nw/lycos.txt',
        'wisenut'   => 'http://www.iplists.com/nw/wisenut.txt'
    );
    foreach ($lists as $spider_name => $url) {
        SimpleCloakV2::_updateCloakingDB($spider_name, $url, $delete_user_defined_data);
    }
    // close connection
    SimpleCloakV2::_close($dbLink);
    // return true to indicate the update was run
    return true;
}
// Updates the database for the mentioned spider by reading the list at $url.
//   $spider_name              - name the records are stored under
//   $url                      - address of the list to download
//   $delete_user_defined_data - when true, existing user-defined records of the
//                               spider are deleted too; otherwise only records
//                               flagged 'N' are replaced
// Does nothing when the download fails or when the list contains no user agent
// or no IP entry. Expects an open database connection (see updateAll()).
function _updateCloakingDB($spider_name, $url, $delete_user_defined_data = false)
{
    // "\r?" before the end-of-line anchor: "$" only matches right before "\n",
    // so without it a list served with CRLF line endings would match nothing
    $ua_regex = '/^# UA "(.*)"\r?$/m';
    $ip_regex = '/^([0-9.]+)\r?$/m';
    // use cURL to read the data from $url
    // NOTE: additional settings are required when accessing the web through a proxy
    $ch = curl_init();
    curl_setopt ($ch, CURLOPT_URL, $url);
    curl_setopt ($ch, CURLOPT_HEADER, 1);
    curl_setopt ($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt ($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt ($ch, CURLOPT_TIMEOUT, 60);
    $result = curl_exec($ch);
    curl_close($ch);
    // a failed download leaves the existing data untouched
    if ($result === false) return;
    // use _parseListURL to parse the list of IPs and user agents
    $lists = SimpleCloakV2::_parseListURL($result, $ua_regex, $ip_regex);
    // if the user agents and IPs weren't retrieved, we cancel the update
    if (!$lists['ua_list'] || !$lists['ip_list']) return;
    // lock the cloak_data table to avoid concurrency problems
    mysql_query('LOCK TABLES cloak_data WRITE');
    // delete all the existing data for $spider_name
    SimpleCloakV2::_deleteSpiderData($spider_name, $delete_user_defined_data ? '' : 'N');
    // insert the list of user agents for the spider
    foreach ($lists['ua_list'] as $ua) {
        SimpleCloakV2::_insertSpiderData($spider_name, 'UA', $ua);
    }
    // insert the list of IPs for the spider
    foreach ($lists['ip_list'] as $ip) {
        SimpleCloakV2::_insertSpiderData($spider_name, 'IP', $ip);
    }
    // release the table lock
    mysql_query('UNLOCK TABLES');
}
// Extracts the user agents and the IPs from downloaded list text.
//   $data     - text to scan
//   $ua_regex - pattern whose first capture group is a user agent
//   $ip_regex - pattern whose first capture group is an IP
// Returns array('ua_list' => ..., 'ip_list' => ...), each holding every first
// capture group found for the corresponding pattern.
function _parseListURL($data, $ua_regex, $ip_regex)
{
    $parsed = array();
    $patterns = array('ua_list' => $ua_regex, 'ip_list' => $ip_regex);
    foreach ($patterns as $key => $pattern) {
        $found = array();
        preg_match_all($pattern, $data, $found);
        // index 1 holds the contents of the first capture group of each match
        $parsed[$key] = $found[1];
    }
    return $parsed;
}
// Adds one record to the cloak_data table.
//   $spider_name     - spider the record belongs to
//   $record_type     - 'UA' or 'IP'
//   $value           - the user agent string or IP
//   $is_user_defined - 'Y' or 'N' (default 'N')
// Expects an open database connection.
function _insertSpiderData($spider_name, $record_type, $value, $is_user_defined = 'N')
{
    // escape every field before it is embedded in the statement
    $fields = array_map(
        'mysql_escape_string',
        array($spider_name, $record_type, $value, $is_user_defined)
    );
    // build and execute the INSERT query
    $statement = "INSERT INTO cloak_data (spider_name, record_type, value, is_user_defined) " .
        "VALUES ('" . implode("', '", $fields) . "')";
    mysql_query($statement);
}
// Removes the cloaking records stored for the given spider.
//   $spider_name     - spider whose records are deleted
//   $is_user_defined - when non-empty ('Y' or 'N'), only records carrying that
//                      flag are deleted; when empty, all of the spider's records go
// Expects an open database connection.
function _deleteSpiderData($spider_name, $is_user_defined = '')
{
    // the spider name condition is always present
    $where = "spider_name='" . mysql_escape_string($spider_name) . "'";
    // optionally narrow the deletion down to one kind of record
    if ($is_user_defined) {
        $where .= " AND is_user_defined = '" . mysql_escape_string($is_user_defined) . "' ";
    }
    mysql_query("DELETE FROM cloak_data WHERE " . $where);
}
// Verifies through DNS that the current request really comes from the spider
// it claims to be. Only use if the visitor is not found via the IPLists
// cloaking database.
//   $ua - array(spider name, regex the reverse-resolved host name must match)
// Returns true when the host name matches the regex and resolves back to the
// connecting IP; the IP is then stored as a user-defined record of the spider.
// Returns false otherwise; an IP that claims the spider's user agent but fails
// either check is stored as a user-defined record of the spider 'bad'.
function botVerifyByDNS($ua = array('google', '#.*\.googlebot\.com$#'))
{
    // an IP already cached as bad is rejected straight away
    if (SimpleCloakV2::isSpider('bad', false, true, true, false)) {
        return false;
    }
    // check only UA since this function is only called if the cloaking DB
    // doesn't handle it; a visitor that does not even claim to be the bot
    // is not investigated
    if (!SimpleCloakV2::isSpider($ua[0], true, false)) {
        return false;
    }
    $connected_ip_address = $_SERVER['REMOTE_ADDR'];
    // reverse lookup
    $host_name = gethostbyaddr($connected_ip_address);
    // the visitor is genuine only if the host name matches the spider's domain
    // regex AND that host name resolves back to the connecting IP; the forward
    // lookup is performed only when the regex matched
    $is_verified = preg_match($ua[1], $host_name)
        && ($connected_ip_address == gethostbyname($host_name));
    // remember the verdict: under the spider's own name when verified,
    // under 'bad' when the claim turned out to be false
    $dbLink = SimpleCloakV2::_connect();
    SimpleCloakV2::_insertSpiderData($is_verified ? $ua[0] : 'bad', 'IP', $connected_ip_address, 'Y');
    SimpleCloakV2::_close($dbLink);
    return $is_verified;
}
// Output-buffer callback: inserts the HTML held in the global
// $__metaRobotsExcludeProxiesCallbackHTML right after every "</title>" in the
// buffered page and returns the modified page.
//   $buffer - the buffered output
function _addMetaRobotsExcludeProxiesCallback($buffer)
{
    global $__metaRobotsExcludeProxiesCallbackHTML;
    // plain string replacement: the tag is caller-supplied text, and with
    // preg_replace any "$1" or "\1" inside it would be treated as a
    // backreference and mangled
    return str_replace('</title>', '</title>' . $__metaRobotsExcludeProxiesCallbackHTML, $buffer);
}
// Decides whether the current visitor is a verified search engine spider and,
// if it is not, optionally arranges for a robots meta tag to be added to the page.
//   $auto_modify_content - when true, output buffering is started so that
//                          $meta_tag is inserted after "</title>"
//   $uas                 - list of array(spider name, host name regex) pairs to verify against
//   $meta_tag            - HTML to insert for visitors that are not verified spiders
//   $passlist_regex      - optional regex; a user agent matching it is let through
//                          ex: #become|lycos|somestupidbot#
// Returns false when the visitor is on the passlist or is a spider verified by
// the cloaking data or by DNS; returns true for every other visitor.
function metaRobotsExcludeProxies($auto_modify_content = true, $uas = array(array('google', '#.*\.googlebot\.com$#'), array('yahoo', '#.*\.yahoo\.net$#'), array('msn', '#.*\.live\.com$#'), array('ask', '#.*\.ask.com$#') ), $meta_tag = '<meta name="robots" content="noindex,nofollow" />', $passlist_regex = '')
{
    global $__metaRobotsExcludeProxiesCallbackHTML;
    if ($meta_tag)
        $__metaRobotsExcludeProxiesCallbackHTML = $meta_tag;
    // the header is not guaranteed to be present
    $user_agent = isset($_SERVER['HTTP_USER_AGENT']) ? $_SERVER['HTTP_USER_AGENT'] : '';
    // if it's on our passlist
    if ($passlist_regex && preg_match($passlist_regex, $user_agent)) {
        return false;
    }
    foreach ($uas as $u) {
        // only investigate spiders the visitor claims to be according to UA
        if (!SimpleCloakV2::isSpider($u[0], true, false)) {
            continue;
        }
        // it's a bot according to IPLists or our user-defined list
        if (SimpleCloakV2::isSpider($u[0], false, true)) {
            return false;
        }
        // it's a bot according to DNS
        if (SimpleCloakV2::botVerifyByDNS($u)) {
            return false;
        }
        // it claims to be this bot but could not be verified: stop looking
        break;
    }
    // either an unverified claim or not a bot according to UA at all
    if ($auto_modify_content) ob_start(array('SimpleCloakV2', '_addMetaRobotsExcludeProxiesCallback'));
    // a real boolean, like every other exit of this function
    return true;
}
}
?>
Save this file as "simple_cloak_v2.inc.php" (this is the name the scripts below include).
You will also need the configuration file (it is referenced in "simple_cloak_v2.php"):
<?php
// Database connection settings read by SimpleCloakV2.

// Set to 1 if your application has already opened the database connection;
// SimpleCloakV2 will then neither connect nor disconnect on its own.
define('USE_CUSTOM_CONNECT_CODE', 0);

// database server (usually localhost)
define('DB_HOST', 'your_db_host');
// database user
define('DB_USER', 'some_user');
// password of that user
define('DB_PASSWORD', 'secret');
// name of the database holding the cloak_* tables
define('DB_DATABASE', 'your_db');
?>
Save this as "config.inc.php."
Then, to implement:
Use this SQL to create the database tables needed for the SimpleCloakV2 class
Run the following queries in your MySQL database (using the mysql command-line client or phpMyAdmin):
-- One row per known user agent string or IP of a spider.
-- record_type tells which of the two the value column holds;
-- is_user_defined distinguishes rows added by the class itself ('Y')
-- from rows loaded from the downloaded lists ('N').
CREATE TABLE `cloak_data` (
`id` int(11) NOT NULL auto_increment,
`spider_name` varchar(255) NOT NULL default '',
`record_type` enum('UA','IP') NOT NULL default 'UA',
`value` varchar(255) NOT NULL default '',
`is_user_defined` enum('N','Y') NOT NULL default 'N',
PRIMARY KEY (`id`),
KEY `value` (`value`)
) ENGINE=MyISAM DEFAULT CHARSET=latin1;
-- Bookkeeping for updateAll(): the list version last seen and when it was checked.
CREATE TABLE `cloak_update` (
`version` varchar(255) NOT NULL default '',
`updated_on` datetime NOT NULL default '0000-00-00 00:00:00'
) ENGINE=MyISAM DEFAULT CHARSET=latin1;
Only if you already have a "cloak_data" table (from our book or a previous version of SimpleCloak on the blog), run this SQL:
-- Adds the is_user_defined flag column to an existing cloak_data table.
ALTER TABLE cloak_data ADD `is_user_defined` ENUM('N','Y') NOT NULL;
Populate the Cloaking database with the data from IPLists.com
Note: This should be run periodically from a cron job to keep the data updated. It will update only once a week regardless. However, you may also put it in the footer of an application.
<?php
// pull in the SimpleCloakV2 class
require_once 'simple_cloak_v2.inc.php';
// refresh the spider lists, then report what happened
$was_updated = SimpleCloakV2::updateAll();
echo $was_updated
    ? "Cloaking database updated!"
    : "Cloaking database was already up to date, or the update failed.";
?>
Then pick *1* of the following methods.
Note: Method #2 is a bit of a kludge, as the RewriteMap directive of Apache cannot be used in .htaccess. *It has not been tested extensively yet!*
METHOD NUMBER 1 -- PHP Implementation
Place this code at the top of your application (or relevant parts thereof):
<?php
// Full "<?php" tag: the short "<?" form only works where short_open_tag is enabled.
// Load the class and let it add the robots meta tag for every visitor that is
// not a verified spider; $_x receives the result of the check.
include_once('simple_cloak_v2.inc.php');
$_x = SimpleCloakV2::metaRobotsExcludeProxies();
?>
The code automatically inserts the meta tag using PHP output buffering. If you want a more custom/efficient solution, that is also possible. See the first parameter of function "metaRobotsExcludeProxies." Set to false, it will not use the output buffering, and you may use the result to effect changes in your application as desired.
METHOD NUMBER 2 -- .htaccess Implementation
Place this in your .htaccess file
# Enable URL rewriting for this directory.
RewriteEngine On
# Apply the rule below only when the User-Agent contains one of these
# search engine names (case-insensitive).
RewriteCond %{HTTP_USER_AGENT} yahoo|slurp|msn|ask|google|gsa [NC]
# Hand the request to proxy.php, passing the requested path as orig_url.
RewriteRule (^.*$) proxy.php?orig_url=$1
And this is the code for proxy.php:
<?php
// proxy.php -- receives the requests of visitors whose user agent names a
// search engine. Unverified visitors get a 403; for verified ones the
// originally requested page is fetched and passed through unchanged.
include ('simple_cloak_v2.inc.php');
// should we deny access?
if (SimpleCloakV2::metaRobotsExcludeProxies(false)) {
    header("HTTP/1.0 403 Forbidden");
    echo 'forbidden ... ';
    exit();
}
// otherwise echo as it was ...
// construct the original URL; REQUEST_URI already starts with "/", so no
// separator is added, and the scheme is stated explicitly
$scheme = (!empty($_SERVER['HTTPS']) && $_SERVER['HTTPS'] != 'off') ? 'https' : 'http';
$url = $scheme . '://' . $_SERVER['SERVER_NAME'] . $_SERVER['REQUEST_URI'];
// get the contents
$ch = curl_init();
curl_setopt ($ch, CURLOPT_URL, $url);
curl_setopt ($ch, CURLOPT_HEADER, 1);
curl_setopt ($ch, CURLOPT_RETURNTRANSFER, 1);
$result = curl_exec($ch);
curl_close($ch);
// the page could not be fetched: say so instead of emitting an empty reply
if ($result === false) {
    header("HTTP/1.0 502 Bad Gateway");
    echo 'could not retrieve the requested page';
    exit();
}
// do some parsing: the header block ends at the FIRST blank line, everything
// after it is the body (which may itself contain blank lines)
$parts = explode("\r\n\r\n", $result, 2);
$headers = $parts[0];
$data = isset($parts[1]) ? $parts[1] : '';
$split_headers = explode("\r\n", $headers);
// we have to reissue the headers as is
foreach ($split_headers as $s) {
    if ($s === '') {
        continue;
    }
    // cURL hands us the body already de-chunked, so headers describing the
    // transfer framing of the fetched response must not be repeated
    if (preg_match('#^(Transfer-Encoding|Connection)\s*:#i', $s)) {
        continue;
    }
    header($s);
}
// echo the body.
echo $data;
?>
Done!












August 16th, 2007 at 1:02 pm
[...] Well that’s where I come in. I have 2 implementations in beta (read: they work according to my tests, but I’m going to be testing more) that address the problem based on the methods the search engines cite. Then, essentially, we’re using a benign form of cloaking (yes, cloaking!) to make it more difficult for bad bots, proxies, etc. to exploit us. They are located here: [...]
August 16th, 2007 at 3:24 pm
Hey Jamie,
thanks for posting this, but I'd need the config.inc.php as well to run it.
Also I wonder what the "proxy.php" is actually... that one seems to be missing too
thanks
christoph
August 16th, 2007 at 6:11 pm
Jamie,
I just figured that you cannot copy/paste the code from his blog since all those quote characters are replaced by non-code quotes … i.e. they don't work in PHP…
any clue on how to copy the code to a php source?
christoph
August 16th, 2007 at 6:45 pm
[...] esta vulnerabilidad, Google publicó un post para ayudarnos a detectar falsos robots. Además, en este post tenéis una implementación de esta detección utilizando código [...]
August 16th, 2007 at 9:44 pm
I get this error when i run the sql querie in phpmysqladmin
MySQL said: Documentation
#1064 - You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near 'â€
what did i miss?
copied the code
made changes to config file, uploaded files
ran sql querie..
any ideas?
August 16th, 2007 at 10:01 pm
"SELECT cloak_data.* FROM cloak_data WHERE TRUE"
??
Last I checked, you don't need a where clause just to always return true...
Thanks for your altruistic contribution to the community here though. I don't administer any sites, so none of this applies to me, but it was nice of you to dive in and help with all this.
August 17th, 2007 at 9:53 am
addendum: rewrite itself is working. However; the script isn't rewriting the html to include noindex/nofollow
August 17th, 2007 at 7:49 pm
Got it working thanks for making the changes
now can i test this somehow to see how/if it works?
August 18th, 2007 at 11:01 am
I am wondering if you could answer a few concerns brought up at this forum
http://www.ihelpyou.com/forums/showthread.php?s=&threadid=25480&perpage=10&pagenumber=3
here are a few of them
» The script relies on the accuracy of resources provided by another site in order to work.
• This other site (iplists) -- where does their information come from?
• How do they obtain the IPs of various bots?
• How often is their list updated?
• How accurate is their IP list?
I noticed some of the "bots" they have listed there, are not even real search engine bots. A few of them are bad bots like Naverbot
more are at the forum
thanks
August 18th, 2007 at 10:46 pm
Thanks so much for the help with this problem. I've been using some PHP tricks to deal with bad bots, but nothing this extensive. I do have a problem with implementing this script though. I get an error (Unknown column 'TRUE' in 'where clause') when it queries the database (SELECT cloak_data.* FROM cloak_data WHERE TRUE AND spider_name...) Is this related to the fact that I'm not using the latest and greatest MySQL?
August 21st, 2007 at 3:32 pm
How I can check that the Cloaking is working with my application?
August 22nd, 2007 at 8:18 pm
Jamie,
great script - just having two problem and thought I would post the question. I copied everything you said exactly. There is a descrepancy:
1. You write "Save this file as 'simple_cloak_v2.php.'" but the PHP Implementation refers to the file as simple_cloak_v2.inc.php
""
So I'm confused and tried things naming both ways - no help.
2. I cannot get the update process to populate my database even though the connection is good (no errors) and the script spits back "Cloaking database updated!" - there's still no data populated in the tables.
3. I added the "" to my header.php file (hope that is supposed to work).
The site renders fine only up to the first line (include_once..., etc.) but dies at $_x = SimpleCloaV2, etc... (remarking that line out allows the site to render ok)
Can you help point me in the right place to look for the cause? I am at my witts end!
Thanks in advance,
Taylor
August 23rd, 2007 at 9:25 am
Don't you mean to say that the first script (above) should be saved as "simple_cloak_v2.inc.php"?
August 23rd, 2007 at 1:23 pm
Since the user is claiming to be a SE USER AGENT and is not, would it be better to just simply show no content whatsoever or display a sitemap with links back to the site pages.
August 23rd, 2007 at 3:05 pm
[...] one needs not be a proxy to mirror and attempt to de-index, but that is the current trend. The counter, offered by Thies and designed by Jaimie Sirovich, is a PHP script that must be inserted every page [...]
August 23rd, 2007 at 5:47 pm
Careful when using on a Wordpress site as the database definitions clash.
I got it working by changing them to this:
define("USE_CUSTOM_CONNECT_CODE", 0);
// usually localhost
define("DB_noel_HOST", "$uberhost");
// db user
define("DB_noel_USER", "$uberuser");
// password
define("DB_noel_PASSWORD", "$uberpass");
//db name
define("DB_noel_DATABASE", $uberprefix."_uber");
in the config.inc.php
and then this in the main simple_cloak.v2.php file
function _connect()
{
if (USE_CUSTOM_CONNECT_CODE) return true;
// Connect to MySQL server
$dbLink = mysql_connect(DB_noel_HOST, DB_noel_USER, DB_noel_PASSWORD)
or die("Could not connect: " . mysql_error());
// Connect to the seophp database
mysql_select_db(DB_noel_DATABASE) or die("Could not select database");
return $dbLink;
}
To use on wordpress just add the lines
include_once('simple_cloak_v2.php');
$_x = SimpleCloakV2::metaRobotsExcludeProxies();
to the top of the index.php file in your wordpress directory, after the
August 23rd, 2007 at 5:50 pm
oops - comment was truncated - continues here...
... after the
August 27th, 2007 at 6:27 pm
I did this on just one of my site's front page & as of today there is no cache in google searching from Canada. Dammit.
September 8th, 2007 at 1:08 pm
Sandy asked, "How I can check that the Cloaking is working with my application?"
I just set this up on my site, and wanted to test it out...
Any thoughts?
Martin
October 24th, 2007 at 8:42 am
I've read Dan's post too and wondered how he implemented his proxy-hack-solution. Thanks Jaimie for sharing yours