- Aug. 16th, 2007
- 29 comments
Note: I didn't realize that WordPress has been converting straight quotes into "pretty" curly quotes since version 2.1. I have turned that feature off, so copying and pasting the code should now work.
Below is the main class necessary for the cloaking functionality, "SimpleCloakV2:"
<?php
// HTML snippet that _addMetaRobotsExcludeProxiesCallback() injects after </title>;
// starts empty and is filled in by SimpleCloakV2::metaRobotsExcludeProxies().
$__metaRobotsExcludeProxiesCallbackHTML = '';
/*
// +----------------------------------------------------------------------+
// | SimpleCloakV2 Version 2 |
// | Class for cloaking content |
// | http://www.SEOEgghead.com |
// +----------------------------------------------------------------------+
// | Copyright (c) 2005-2006 Jaimie Sirovich and Cristian Darie |
// +----------------------------------------------------------------------+
*/
// load configuration file
require_once('config.inc.php');
/**
 * Spider detection and "exclude proxies" helper.
 *
 * Every method is static and is meant to be called as SimpleCloakV2::method().
 * Spider user agents and IPs are read from the `cloak_data` table; the time and
 * version of the last refresh are kept in `cloak_update`.
 *
 * NOTE: the class uses the legacy mysql_* functions and never passes a link
 * identifier to mysql_query(), i.e. it relies on the most recently opened
 * connection. When USE_CUSTOM_CONNECT_CODE is truthy the host application is
 * expected to have opened that connection already.
 */
class SimpleCloakV2
{
    /**
     * Opens the database connection described by the DB_* constants.
     *
     * @return mixed true when USE_CUSTOM_CONNECT_CODE is set (nothing is opened),
     *               otherwise the link returned by mysql_connect(). Terminates
     *               the script via die() if connecting or selecting the DB fails.
     */
    public static function _connect()
    {
        if (USE_CUSTOM_CONNECT_CODE) return true;
        // Connect to MySQL server
        $dbLink = mysql_connect(DB_HOST, DB_USER, DB_PASSWORD)
            or die("Could not connect: " . mysql_error());
        // Connect to the seophp database
        mysql_select_db(DB_DATABASE) or die("Could not select database");
        return $dbLink;
    }

    /**
     * Closes a link obtained from _connect().
     *
     * @param mixed $dbLink value previously returned by _connect()
     * @return mixed true (without closing anything) when USE_CUSTOM_CONNECT_CODE
     *               is set, otherwise nothing
     */
    public static function _close($dbLink)
    {
        if (USE_CUSTOM_CONNECT_CODE) return true;
        // close database connection
        mysql_close($dbLink);
    }

    /**
     * Tells whether a filter argument of _get() carries a value.
     * '', null and false mean "no filter"; the string '0' is a real value.
     */
    private static function _isFilterSet($filter)
    {
        return $filter !== '' && $filter !== null && $filter !== false;
    }

    /**
     * Scores how strongly the current request looks like the given spider.
     *
     * @param string $spider_name            spider to test for; '' matches any spider
     * @param bool   $check_uas              compare $_SERVER['HTTP_USER_AGENT'] with stored UAs (+2)
     * @param bool   $check_ips              compare $_SERVER['REMOTE_ADDR'] with stored IPs (+3)
     * @param bool   $use_user_defined_data  when false, only rows with is_user_defined = 'N' are used
     * @param bool   $ignore_bad_uas         when true, rows whose spider_name is 'bad' are excluded
     * @return int confidence level: 0, 2, 3 or 5
     */
    public static function isSpider($spider_name = '', $check_uas = true, $check_ips = true, $use_user_defined_data = true, $ignore_bad_uas = true)
    {
        // default confidence level to 0
        $confidence = 0;
        $user_defined_filter = $use_user_defined_data ? '' : 'N';
        $excluded_spider = $ignore_bad_uas ? 'bad' : '';

        // A missing/empty value must never be handed to _get(): an empty filter
        // is skipped there, which would make every stored record match.
        $user_agent = isset($_SERVER['HTTP_USER_AGENT']) ? (string) $_SERVER['HTTP_USER_AGENT'] : '';
        $remote_addr = isset($_SERVER['REMOTE_ADDR']) ? (string) $_SERVER['REMOTE_ADDR'] : '';

        // matching user agent?
        if ($check_uas && $user_agent !== '') {
            if (self::_get(0, $spider_name, 'UA', $user_agent, '', $user_defined_filter, $excluded_spider)) {
                $confidence += 2;
            }
        }
        // matching IP? (exact address, or a stored dotted prefix of it)
        if ($check_ips && $remote_addr !== '') {
            if (self::_get(0, $spider_name, 'IP', '', $remote_addr, $user_defined_filter, $excluded_spider)) {
                $confidence += 3;
            }
        }
        // return confidence level
        return $confidence;
    }

    /**
     * Retrieves rows from cloak_data, narrowed by every filter that is set.
     *
     * @param int    $id                   row id; 0 = no filter
     * @param string $spider_name          exact spider name
     * @param string $record_type          'UA' or 'IP'
     * @param string $value                exact match on the value column
     * @param string $wildcard_value       matches rows whose value equals it, or whose
     *                                     value followed by '.' is a prefix of it
     * @param string $is_user_defined_data 'Y' / 'N' match on the is_user_defined column
     * @param string $not_spider_name      spider name to exclude
     * @return array list of associative rows (empty when nothing matches or the query fails)
     */
    public static function _get($id = 0, $spider_name = '', $record_type = '',
        $value = '', $wildcard_value = '', $is_user_defined_data = '', $not_spider_name = '')
    {
        // by default, retrieve all records ("1 = 1" instead of "TRUE" so that
        // older MySQL servers, which lack the TRUE keyword, accept the query)
        $q = " SELECT cloak_data.* FROM cloak_data WHERE 1 = 1 ";
        // add filters
        if ($id) {
            $id = (int) $id;
            $q .= " AND id = $id ";
        }
        if (self::_isFilterSet($spider_name)) {
            $spider_name = mysql_escape_string($spider_name);
            $q .= " AND spider_name = '$spider_name' ";
        }
        if (self::_isFilterSet($record_type)) {
            $record_type = mysql_escape_string($record_type);
            $q .= " AND record_type = '$record_type' ";
        }
        if (self::_isFilterSet($value)) {
            $value = mysql_escape_string($value);
            $q .= " AND value = '$value' ";
        }
        if (self::_isFilterSet($wildcard_value)) {
            $wildcard_value = mysql_escape_string($wildcard_value);
            $q .= " AND ( '$wildcard_value' = value OR '$wildcard_value' LIKE CONCAT(value, '.%') ) ";
        }
        if (self::_isFilterSet($is_user_defined_data)) {
            $is_user_defined_data = mysql_escape_string($is_user_defined_data);
            // the column is named is_user_defined (see the INSERT in _insertSpiderData)
            $q .= " AND is_user_defined = '$is_user_defined_data' ";
        }
        if (self::_isFilterSet($not_spider_name)) {
            $not_spider_name = mysql_escape_string($not_spider_name);
            $q .= " AND spider_name <> '$not_spider_name' ";
        }

        $dbLink = self::_connect();
        // execute the query
        $result = mysql_query($q);
        // return the results as an associative array
        $rows = array();
        if ($result) {
            while ($row = mysql_fetch_assoc($result)) {
                $rows[] = $row;
            }
        }
        self::_close($dbLink);
        return $rows;
    }

    /**
     * Refreshes the whole spider database from iplists.com, but only if our data
     * is more than 7 days old and the published version differs from the stored one.
     *
     * The check time/version is recorded in cloak_update before the lists are
     * downloaded, so at most one version check is made per 7 days.
     *
     * @param bool $delete_user_defined_data also delete user-defined rows of each refreshed spider
     * @return bool true when the lists were re-imported, false otherwise
     */
    public static function updateAll($delete_user_defined_data = false)
    {
        $dbLink = self::_connect();

        // retrieve last update information from database (the table may be empty)
        $db_version = null;
        $updated_on = null;
        $tmp = mysql_query("SELECT cloak_update.* FROM cloak_update");
        $updated = $tmp ? mysql_fetch_assoc($tmp) : false;
        if (is_array($updated)) {
            $db_version = $updated['version'];
            $updated_on = $updated['updated_on'];
        }

        // if the last update is more recent than 7 days, don't attempt an update
        if (isset($updated_on) &&
            (strtotime($updated_on) > strtotime("-604800 seconds")))
        {
            self::_close($dbLink);
            // return false to indicate an update wasn't performed
            return false;
        }

        // read the latest iplists version
        $version_url = 'http://www.iplists.com/nw/version.php';
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $version_url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_TIMEOUT, 60);
        $latest_version = curl_exec($ch);
        curl_close($ch);

        // if no updated version information was retrieved, abort
        if (!$latest_version)
        {
            self::_close($dbLink);
            // return false to indicate an update wasn't performed
            return false;
        }

        // save the update data
        $escaped_version = mysql_escape_string($latest_version);
        mysql_query("DELETE FROM cloak_update");
        mysql_query("INSERT INTO cloak_update (version, updated_on) " .
            "VALUES('$escaped_version', NOW())");

        // if we already have the current data, don't attempt an update
        // (compare the raw value: the stored version is not escaped when read back)
        if ($latest_version === $db_version)
        {
            // go through _close() so USE_CUSTOM_CONNECT_CODE is honoured
            self::_close($dbLink);
            // return false to indicate an update wasn't performed
            return false;
        }

        // update the database, one list per spider
        $sources = array(
            'google'    => 'http://www.iplists.com/nw/google.txt',
            'yahoo'     => 'http://www.iplists.com/nw/inktomi.txt',
            'msn'       => 'http://www.iplists.com/nw/msn.txt',
            'ask'       => 'http://www.iplists.com/nw/askjeeves.txt',
            'altavista' => 'http://www.iplists.com/nw/altavista.txt',
            'lycos'     => 'http://www.iplists.com/nw/lycos.txt',
            'wisenut'   => 'http://www.iplists.com/nw/wisenut.txt',
        );
        foreach ($sources as $spider_name => $list_url) {
            self::_updateCloakingDB($spider_name, $list_url, $delete_user_defined_data);
        }

        // close connection
        self::_close($dbLink);
        // return true to indicate a successful update
        return true;
    }

    /**
     * Replaces the stored data of one spider with the list downloaded from $url.
     * Does not open a connection itself; updateAll() calls it while connected.
     *
     * @param string $spider_name              spider whose rows are replaced
     * @param string $url                      list URL ('# UA "..."' lines and bare IP lines)
     * @param bool   $delete_user_defined_data when false, only rows with is_user_defined = 'N' are deleted
     * @return void nothing is changed when either the UA list or the IP list comes back empty
     */
    public static function _updateCloakingDB($spider_name, $url, $delete_user_defined_data = false)
    {
        $ua_regex = '/^# UA "(.*)"$/m';
        $ip_regex = '/^([0-9.]+)$/m';

        // use cURL to read the data from $url
        // NOTE: additional settings are required when accessing the web through a proxy
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HEADER, 1);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($ch, CURLOPT_TIMEOUT, 60);
        $result = curl_exec($ch);
        curl_close($ch);

        // use _parseListURL to parse the list of IPs and user agents
        $lists = self::_parseListURL($result, $ua_regex, $ip_regex);
        // if the user agents and IPs weren't retrieved, we cancel the update
        if (!$lists['ua_list'] || !$lists['ip_list']) return;

        // lock the cloak_data table to avoid concurrency problems
        mysql_query('LOCK TABLES cloak_data WRITE');
        // delete all the existing data for $spider_name
        self::_deleteSpiderData($spider_name, $delete_user_defined_data ? '' : 'N');
        // insert the list of user agents for the spider
        foreach ($lists['ua_list'] as $ua) {
            self::_insertSpiderData($spider_name, 'UA', $ua);
        }
        // insert the list of IPs for the spider
        foreach ($lists['ip_list'] as $ip) {
            self::_insertSpiderData($spider_name, 'IP', $ip);
        }
        // release the table lock
        mysql_query('UNLOCK TABLES');
    }

    /**
     * Extracts user agents and IPs from a downloaded list.
     *
     * @param string|false $data     raw response text (false when the download failed)
     * @param string       $ua_regex pattern whose first capture group is a user agent
     * @param string       $ip_regex pattern whose first capture group is an IP
     * @return array array('ua_list' => string[], 'ip_list' => string[])
     */
    public static function _parseListURL($data, $ua_regex, $ip_regex)
    {
        $data = (string) $data;
        preg_match_all($ua_regex, $data, $ua_matches);
        preg_match_all($ip_regex, $data, $ip_matches);
        return array('ua_list' => $ua_matches[1], 'ip_list' => $ip_matches[1]);
    }

    /**
     * Inserts one row into cloak_data. Requires an open connection.
     *
     * @param string $spider_name     spider the record belongs to
     * @param string $record_type     'UA' or 'IP'
     * @param string $value           the user agent string or IP
     * @param string $is_user_defined 'Y' or 'N'
     */
    public static function _insertSpiderData($spider_name, $record_type, $value, $is_user_defined = 'N')
    {
        // escape input data
        $spider_name = mysql_escape_string($spider_name);
        $record_type = mysql_escape_string($record_type);
        $value = mysql_escape_string($value);
        $is_user_defined = mysql_escape_string($is_user_defined);
        // build and execute the INSERT query
        $q = "INSERT INTO cloak_data (spider_name, record_type, value, is_user_defined) " .
            "VALUES ('$spider_name', '$record_type', '$value', '$is_user_defined')";
        mysql_query($q);
    }

    /**
     * Deletes the cloak_data rows of one spider. Requires an open connection.
     *
     * @param string $spider_name     spider whose rows are deleted
     * @param string $is_user_defined '' deletes all rows; 'Y' / 'N' restricts the delete to that flag
     */
    public static function _deleteSpiderData($spider_name, $is_user_defined = '')
    {
        // escape input data
        $spider_name = mysql_escape_string($spider_name);
        // build and execute the DELETE query
        $q = "DELETE FROM cloak_data WHERE spider_name='$spider_name'";
        if ($is_user_defined) {
            $is_user_defined = mysql_escape_string($is_user_defined);
            $q .= " AND is_user_defined = '$is_user_defined' ";
        }
        mysql_query($q);
    }

    /**
     * Stores the current REMOTE_ADDR as a user-defined IP record for $spider_name.
     */
    private static function _rememberRemoteAddr($spider_name)
    {
        $dbLink = self::_connect();
        self::_insertSpiderData($spider_name, 'IP', $_SERVER['REMOTE_ADDR'], 'Y');
        self::_close($dbLink);
    }

    /**
     * Verifies a claimed spider by reverse + forward DNS lookup and caches the verdict.
     * Only use it when the IP was not found via the IPLists cloaking database.
     *
     * The IP is stored under $ua[0] when it verifies, and under 'bad' when either
     * lookup disagrees, so later requests are answered from the database.
     *
     * @param array $ua array(spider name, regex the reverse-resolved host name must match)
     * @return bool true only when the user agent claims to be $ua[0] and DNS confirms it
     */
    public static function botVerifyByDNS($ua = array('google', '#.*\.googlebot\.com$#'))
    {
        // check cache of bad bots
        if (self::isSpider('bad', false, true, true, false)) {
            return false;
        }
        // check only UA since this function is only called if the cloaking DB doesn't handle it
        if (!self::isSpider($ua[0], true, false)) {
            // it does not even say it's a bot via UA
            return false;
        }

        $connected_ip_address = $_SERVER['REMOTE_ADDR'];
        // reverse lookup
        $host_name = gethostbyaddr($connected_ip_address);
        // it claims a certain UA but the reverse lookup does not match the
        // corresponding domain regex: store it as bad and abort
        if (!is_string($host_name) || !preg_match($ua[1], $host_name)) {
            self::_rememberRemoteAddr('bad');
            return false;
        }

        // forward lookup: the host name must resolve back to the connecting IP
        $host_name_ip_address = gethostbyname($host_name);
        if ($connected_ip_address == $host_name_ip_address) {
            self::_rememberRemoteAddr($ua[0]);
            return true;
        }

        // gethostbyaddr says the right thing, but gethostbyname does not
        self::_rememberRemoteAddr('bad');
        return false;
    }

    /**
     * Output-buffer callback: inserts the configured meta tag right after the
     * first closing title tag (matched case-insensitively).
     *
     * The tag is inserted as literal text, so '$' and '\' inside it are not
     * interpreted as regex back-references.
     *
     * @param string $buffer buffered page output
     * @return string the buffer, with the tag inserted when a closing title tag exists
     */
    public static function _addMetaRobotsExcludeProxiesCallback($buffer)
    {
        global $__metaRobotsExcludeProxiesCallbackHTML;
        $closing_tag = '</title>';
        $position = stripos($buffer, $closing_tag);
        if ($position === false) {
            return $buffer;
        }
        return substr_replace(
            $buffer,
            (string) $__metaRobotsExcludeProxiesCallbackHTML,
            $position + strlen($closing_tag),
            0
        );
    }

    /**
     * Decides whether the current request should receive the robots-exclusion
     * meta tag, i.e. whether it is NOT a verified search-engine spider.
     *
     * @param bool   $auto_modify_content when true, starts an output buffer that injects $meta_tag
     * @param array  $uas                 list of array(spider name, reverse-DNS host regex)
     * @param string $meta_tag            HTML to inject after </title>
     * @param string $passlist_regex      user agents matching this regex are always let through,
     *                                    ex: #become|lycos|somestupidbot#
     * @return bool|int false when the user agent is passlisted or the spider is verified
     *                  (by stored IP or DNS); true when the user agent claims to be a
     *                  listed spider but cannot be verified; 2 when the user agent does
     *                  not claim to be any listed spider
     */
    public static function metaRobotsExcludeProxies($auto_modify_content = true, $uas = array(array('google', '#.*\.googlebot\.com$#'), array('yahoo', '#.*\.yahoo\.net$#'), array('msn', '#.*\.live\.com$#'), array('ask', '#.*\.ask\.com$#') ), $meta_tag = '<meta name="robots" content="noindex,nofollow" />', $passlist_regex = '')
    {
        global $__metaRobotsExcludeProxiesCallbackHTML;
        if ($meta_tag)
            $__metaRobotsExcludeProxiesCallbackHTML = $meta_tag;

        $callback = array('SimpleCloakV2', '_addMetaRobotsExcludeProxiesCallback');

        // if it's on our passlist
        if ($passlist_regex) {
            $user_agent = isset($_SERVER['HTTP_USER_AGENT']) ? (string) $_SERVER['HTTP_USER_AGENT'] : '';
            if (preg_match($passlist_regex, $user_agent)) return false;
        }

        foreach ($uas as $u) {
            // only investigate spiders the user agent claims to be
            if (!self::isSpider($u[0], true, false)) {
                continue;
            }
            // it's a bot according to IPLists or our user-defined list
            if (self::isSpider($u[0], false, true)) {
                return false;
            }
            // it's a bot according to DNS
            if (self::botVerifyByDNS($u)) {
                return false;
            }
            // it claims to be a bot but is not one
            if ($auto_modify_content) ob_start($callback);
            return true;
        }

        // it's not a bot according to UA
        if ($auto_modify_content) ob_start($callback);
        // NOTE(review): the original returned `true + 1`, which evaluates to the
        // integer 2; the value is kept so existing callers see the same result.
        // Confirm whether a plain `true` was intended.
        return 2;
    }
}
?>
Save this file as "simple_cloak_v2.inc.php" (this is the name used by the include statements in the examples below).
You will also need the configuration file (it is referenced in "simple_cloak_v2.php"):
<?php
// Database connection settings read by SimpleCloakV2.

// Set to 1 if your application has already opened the MySQL connection;
// SimpleCloakV2 will then neither connect nor close the connection itself.
define('USE_CUSTOM_CONNECT_CODE', 0);

// MySQL host (usually localhost)
define('DB_HOST', 'your_db_host');

// MySQL user name
define('DB_USER', 'some_user');

// MySQL password
define('DB_PASSWORD', 'secret');

// name of the database holding the cloak_data and cloak_update tables
define('DB_DATABASE', 'your_db');
?>
Save this as "config.inc.php."
Then, to implement:
Use this SQL to create the database tables needed for the SimpleCloakV2 class
Run the following queries in your MySQL database (using the mysql command-line client or phpMyAdmin):
-- Spider records: one row per known user agent (record_type 'UA') or IP
-- (record_type 'IP'); is_user_defined marks rows that were added locally
-- rather than imported from the downloaded lists.
CREATE TABLE `cloak_data` (
`id` int(11) NOT NULL auto_increment,
`spider_name` varchar(255) NOT NULL default '',
`record_type` enum('UA','IP') NOT NULL default 'UA',
`value` varchar(255) NOT NULL default '',
`is_user_defined` enum('N','Y') NOT NULL default 'N',
PRIMARY KEY (`id`),
KEY `value` (`value`)
) ENGINE=MyISAM DEFAULT CHARSET=latin1;
-- Bookkeeping for SimpleCloakV2::updateAll(): the list version that was last
-- seen and when it was checked.
CREATE TABLE `cloak_update` (
`version` varchar(255) NOT NULL default '',
`updated_on` datetime NOT NULL default '0000-00-00 00:00:00'
) ENGINE=MyISAM DEFAULT CHARSET=latin1;
Only if you already have a "cloak_data" table (from our book or a previous version of SimpleCloak on the blog), run this SQL:
-- Upgrade an existing cloak_data table: adds the is_user_defined flag column
-- ('N' is the first enum member, so existing rows receive 'N').
ALTER TABLE cloak_data ADD `is_user_defined` ENUM('N','Y') NOT NULL;
Populate the Cloaking database with the data from IPLists.com
Note: This should be run periodically from a cron job to keep the data updated. It will update only once a week regardless. However, you may also put it in the footer of an application.
<?php
// Pull in the SimpleCloakV2 class.
require_once 'simple_cloak_v2.inc.php';

// Refresh the cloaking data and report the outcome: updateAll() returns true
// only when the spider lists were actually re-imported.
echo SimpleCloakV2::updateAll()
    ? "Cloaking database updated!"
    : "Cloaking database was already up to date, or the update failed.";
?>
Then pick *1* of the following methods.
Note: Method #2 is a bit of a kludge, as the RewriteMap directive of Apache cannot be used in .htaccess. *It has not been tested extensively yet!*
METHOD NUMBER 1 — PHP Implementation
Place this code at the top of your application (or relevant parts thereof):
<?php
// Use the full "<?php" open tag: the short "<?" form only works when
// short_open_tag is enabled in php.ini.
include_once('simple_cloak_v2.inc.php');
// Starts the output buffer that injects the robots meta tag unless the visitor
// is a verified spider; $_x holds the verdict (false = verified spider or passlisted).
$_x = SimpleCloakV2::metaRobotsExcludeProxies();
?>
The code automatically inserts the meta tag using PHP output buffering. If you want a more custom/efficient solution, that is also possible. See the first parameter of function "metaRobotsExcludeProxies." Set to false, it will not use the output buffering, and you may use the result to effect changes in your application as desired.
METHOD NUMBER 2 — .htaccess Implementation
Place this in your .htaccess file
# Route every request whose User-Agent contains one of the listed spider
# tokens (NC = case-insensitive) to proxy.php, passing the originally
# requested path in the orig_url query parameter.
RewriteEngine On
RewriteCond %{HTTP_USER_AGENT} yahoo|slurp|msn|ask|google|gsa [NC]
RewriteRule (^.*$) proxy.php?orig_url=$1
And this is the code for proxy.php:
<?php
// proxy.php: gatekeeper for requests whose user agent claims to be a spider.
// Unverified "spiders" get a 403; verified ones receive the original page,
// which is fetched from this same server and replayed (headers and body).
include ('simple_cloak_v2.inc.php');

// should we deny access?
if (SimpleCloakV2::metaRobotsExcludeProxies(false)) {
    header("HTTP/1.0 403 Forbidden");
    echo 'forbidden … ';
    exit();
}

// otherwise echo as it was …
// construct the original URL; REQUEST_URI already starts with "/", so strip it
// to avoid "host//path", and use the scheme of the incoming request
$scheme = (!empty($_SERVER['HTTPS']) && $_SERVER['HTTPS'] !== 'off') ? 'https' : 'http';
$url = $scheme . '://' . $_SERVER['SERVER_NAME'] . '/' . ltrim($_SERVER['REQUEST_URI'], '/');

// get the contents (no User-Agent is sent, so the rewrite rule does not
// route this inner request back to proxy.php)
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_HEADER, 1);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
$result = curl_exec($ch);
// cURL reports the exact byte length of the header section, which is more
// reliable than searching the response for the first blank line
$header_size = ($result === false) ? 0 : (int) curl_getinfo($ch, CURLINFO_HEADER_SIZE);
curl_close($ch);

if ($result === false) {
    header("HTTP/1.0 502 Bad Gateway");
    exit();
}

// split the response into header section and body
$raw_headers = trim((string) substr($result, 0, $header_size));
$data = (string) substr($result, $header_size);

// several header blocks may be present (e.g. "100 Continue"); only the last
// one describes the body we are about to send
$header_blocks = preg_split('#\r?\n\r?\n#', $raw_headers);
$split_headers = preg_split('#\r?\n#', (string) end($header_blocks));

// reissue the headers, except those describing the framing of the upstream
// connection: cURL has already decoded the body, so replaying them would
// corrupt the response
$skipped_headers = array('transfer-encoding', 'content-length', 'connection', 'keep-alive');
foreach ($split_headers as $s) {
    if ($s === '') {
        continue;
    }
    $colon = strpos($s, ':');
    $header_name = ($colon === false) ? '' : strtolower(trim(substr($s, 0, $colon)));
    if (in_array($header_name, $skipped_headers, true)) {
        continue;
    }
    // keep every Set-Cookie line instead of letting each replace the previous one
    header($s, $header_name !== 'set-cookie');
}

// echo the body.
echo $data;
?>
Done!
"29 Wise Comments Banged Out Somewhere On The Internet ..."
Hey Jamie, thanks for posting this, but I'd need the config.inc.php as well to run it. Also I wonder what the "proxy.php" is actually… that one seems to be missing too thanks
Jamie, I just figured that you cannot copy/paste the code from his blog since all those quote characters are replaced by non-code quotes … i.e. they don't work in PHP… any clue on how to copy the code to a php source? christoph
I get this error when i run the sql querie in phpmysqladmin MySQL said: Documentation what did i miss?
"SELECT cloak_data.* FROM cloak_data WHERE TRUE" Last I checked, you don't need a where clause just to always return true… Thanks for your altruistic contribution to the community here though. I don't administer any sites, so none of this applies to me, but it was nice of you to dive in and help with all this.
addendum: rewrite itself is working. However; the script isn't rewriting the html to include noindex/nofollow
Got it working thanks for making the changes
I am wondering if you could answer a few concerns brought up at this forum here are a few of them • This other site (iplists) — where does their information come from? I noticed some of the "bots" they have listed there, are not even real search engine bots. A few of them are bad bots like Naverbot more are at the forum thanks
Thanks so much for the help with this problem. I've been using some PHP tricks to deal with bad bots, but nothing this extensive. I do have a problem with implementing this script though. I get an error (Unknown column 'TRUE' in 'where clause') when it queries the database (SELECT cloak_data.* FROM cloak_data WHERE TRUE AND spider_name…) Is this related to the fact that I'm not using the latest and greatest MySQL?
Jamie, great script - just having two problem and thought I would post the question. I copied everything you said exactly. There is a descrepancy: 1. You write "Save this file as 'simple_cloak_v2.php.'" but the PHP Implementation refers to the file as simple_cloak_v2.inc.php "" So I'm confused and tried things naming both ways - no help. 2. I cannot get the update process to populate my database even though the connection is good (no errors) and the script spits back "Cloaking database updated!" - there's still no data populated in the tables. 3. I added the "" to my header.php file (hope that is supposed to work). The site renders fine only up to the first line (include_once…, etc.) but dies at $_x = SimpleCloaV2, etc… (remarking that line out allows the site to render ok) Can you help point me in the right place to look for the cause? I am at my witts end! Thanks in advance, Taylor
Don't you mean to say that the first script (above) should be saved as "simple_cloak_v2.inc.php"?
Since the user is claiming to be a SE USER AGENT and is not, would it be better to just simply show no content whatsoever or display a sitemap with links back to the site pages.
Careful when using on a Wordpress site as the database definitions clash. I got it working by changing them to this: define("USE_CUSTOM_CONNECT_CODE", 0); in the config.inc.php and then this in the main simple_cloak.v2.php file function _connect() // Connect to the seophp database return $dbLink; To use on wordpress just add the lines include_once('simple_cloak_v2.php'); to the top of the index.php file in your wordpress directory, after the
I did this on just one of my site's front page & as of today there is no cache in google searching from Canada. Dammit.
Sandy asked, "How I can check that the Cloaking is working with my application?" I just set this up on my site, and wanted to test it out… Any thoughts? Martin
I've read Dan's post too and wondered how he implemented his proxy-hack-solution. Thanks Jaimie for sharing yours
Thanks for the advice and code.I'm stuck with the auto updater. Its not allowing bots LOL. "Forbidden Additionally, a 500 Internal Server Error error was encountered while trying to use an ErrorDocument to handle the request." I'd prefer not to copy and paste into the database.
I think he got sick of everyone automating it. So add a USER_AGENT setting to cURL and I think it'll work. Let me know and I'll repost the code saying MSIE 6.0 or something.
I have been trying for a couple of days to get this to work by copy pasting it into a php file in dreamweaver. I am not sure what is wrong but i cant get the code to work out of the box. Foe example it seems that the php syntax doesn't highlight correctly and when saved and run it has parsing errors. Can you help me out with getting this working? Any help would be appreciated. Thanks
Jaimie, Re. UA, yes that works. As this is v2 of SimpleCloak and I've yet to find some elsewhere, can you please provide more context as to this upgrade? Looks like it's for adding proxies supports, however v1 is what's still linked from the "software" section hence v2's still in development(?)… all correct? Also, question for back on v1: if upon updates the table cloak)update is getting populated OK yet cloak_data isn't, how to troubleshoot? Cheers,
is cloakin really any dood, i alway thought its all about links not content……….
Our site has been proxy hacked by https://peek-a-boo.appspot.com does anyone know what to do? We tried contacting google but no response
Jaimie, I have implemented the script on one of my sites that was being plagued by all sorts of black hat seo tricks. I couldn't just copy and paste, I had to type everything, but it was well worth it. After 10 days, all pages that use the script are still in google with cache and there are no reports of pages not being followed in the webmaster tools. So I guess everything is working as it should. What's even better, is that most proxy-indexed pages have disappeared from Google.
When you use this method, don't forget to add the new user agent info for Bing and a number of new IPs for the Bing crawler (look in your server logs). The iplists don't have this information and it will cause your site to be de-listed from Bing and perhaps Yahoo (one of the Bing and Yahoo crawlers seem to share 1 IP). Set the new information in your database as "user-defined" to yes.
Method #1 does not work with Magento, as the noindex,nofollow tags get filtered out. Magento also works with a cache, which makes it more complicated. Method #2 should work with Magento, as I have tested with the User Agent Switcher plugin for Firefox. However, if you use "fetch as Googlebot" through your Webmaster account at Google, then you'll get the error that the page is unreachable (404). Anybody has any ideas? SEO Egghead by Jaimie Sirovich » How To Guide: Prevent Google Proxy Hacking[...] Well that's where I come in. I have 2 implementations in beta (read: they work according to my tests, but I'm going to be testing more) that address the problem based on the methods the search engines cite. Then, essentially, we're using a benign form of cloaking (yes, cloaking!) to make it more difficult for bad bots, proxies, etc. to exploit us. They are located here: [...] refugeenet.org Blog » Bug en Google: tu sitio web puede ser penalizado en el buscador mediante un ‘ataque proxy’[...] esta vulnerabilidad, Google publicó un post para ayudarnos a detectar falsos robots. Además, en este post tenéis una implementación de esta detección utilizando código [...] The Binary Cult Blog » Blog Archive » Google SEO DoS Exploit[...] one needs not be a proxy to mirror and attempt to de-index, but that is the current trend. The counter, offered by Thies and designed by Jaimie Sirovich, is a PHP script that must be inserted every page [...]
|
















