#!/usr/bin/php
\n"; else $NL="\n";
if ($show_html) $HR="
\n"; else $HR="---------------------------------------------------------------------------------------------------\n";
if ($show_html) $B=""; else $B="";
if ($show_html) $B_=""; else $B_="";
/*
* Start of main()
*/
if ($show_html)
{
echo "";
}
$keywords=explode(",",$extra_keywords);
echo "$NL$B Scraping max. $max_results results for the main keyword \"$main_keyword\" using ".count($keywords)." additional keywords $B_ $NL$NL";
/*
* This loop iterates through all keyword combinations
*/
$ch=NULL;
foreach($keywords as $keyword)
{
if ($max_results<=0) break;
$search_string=urlencode($main_keyword." ".$keyword);
// force new curl session
echo "$NL";
echo "===========================================================================================================================$NL";
echo "Scraping for \"$main_keyword $keyword\" $NL";
echo "===========================================================================================================================$NL";
echo "$NL";
$runs=0;
$res=proxy_api("rotate");
$ip="";
if ($res <= 0)
{
echo "Error: Proxy API connection failed (Error $res).$NL$NL$NL";
sleep(2);
break;
} else
{
echo "API: Received proxy IP $PROXY[external_ip] on port $PROXY[port]$NL";
}
$ch=new_curl_session($ch);
$errors=0;
$ip="";
/*
* This loop iterates through all available google result pages
*/
while (1)
{
if ($max_results<=0) break;
$runs++;
echo "Run $runs \t Page $page \t loading$NL";
if ((!$ip) || ($ip == ""))
$ip=getip($ch); // Test of the external IP and if the current proxy is ready
if ((!$ip) || ($ip == "")) // If the proxy didn't work: rotate to next proxy
{
echo "Proxy is not working, rotating ..$NL";
$res=proxy_api("rotate");
$ip="";
if ($res <= 0)
{
echo "Error: API connection failed (Error $res), retry.$NL$NL$NL";
sleep (10);
continue;
} else
{
echo "API: Received proxy IP $PROXY[external_ip] on port $PROXY[port]$NL";
}
$ch=new_curl_session($ch);
continue;
}
echo "Current tested IP-Address: $ip$NL$NL";
$google_ip="www.google.com"; // hidden potential left
if ($page == 0)
{
// we imitate a firefox browser search and will query for 100 results
$url="http://$google_ip/search?q=$search_string&ie=utf-8&as_qdr=all&aq=t&rls=org:mozilla:us:official&client=firefox&num=100";
} else
{
$num=$page*100;
$url="http://$google_ip/search?q=$search_string&ie=utf-8&as_qdr=all&aq=t&rls=org:mozilla:us:official&client=firefox&start=$num&num=100";
}
echo "Search URL: $url$NL";
curl_setopt ($ch, CURLOPT_URL, $url);
$htmdata = curl_exec ($ch);
$newtry=0;
if (!$htmdata)
{
$error = curl_error($ch);
$info = curl_getinfo($ch);
echo "\tError browsing: $error [ $info ]$NL";
sleep (3);
$newtry=1;
}
if (strstr($htmdata,"computer virus or spyware application"))
{
echo("Captcha error is popping up ! We need more proxies !");
die();
$newtry=1;
}
if (strstr($htmdata,"entire network is affected"))
{
echo("Google blocked us, we need more proxies !$NL");
die();
$newtry=1;
}
if (strstr($htmdata,"http://www.download.com/Antivirus"))
{
echo("Google blocked us, we need more proxies !$NL");
die();
$newtry=1;
}
if ($newtry)
{
if ($errors++ > 3)
{
echo "Abort: too many google errors! $NL$NL";
sleep(5);
break;
}
$res=proxy_api("rotate");
$ip="";
if ($res <= 0)
{
echo "Error: API connection failed (Error $res), retry.$NL$NL$NL";
sleep (10);
} else
{
echo "API: Received proxy IP $PROXY[external_ip] on port $PROXY[port]$NL";
}
echo "Rotated IP and retrying$NL";
$ch=new_curl_session($ch);
continue;
}
$skip=0;
// now we test if (more) results are available
if (strstr($htmdata,"/images/yellow_warning.gif"))
{
echo "No (more) results left$NL";
$skip=1;
}
if (!$skip)
{
$len=strlen($htmdata);
echo "\t Received $len bytes$NL";
// Now we parse the html content, putting it into a DOM tree
$dom = new domDocument;
$dom->strictErrorChecking = false;
$dom->preserveWhiteSpace = true;
@$dom->loadHTML($htmdata);
$lists=$dom->getElementsByTagName('li');
$num=0;
foreach ($lists as $list)
{
unset($ar);unset($divs);unset($div);unset($cont);unset($result);unset($tmp);
$result['main_keyword']=$main_keyword;
$result['sub_keyword']=$keyword;
$ar=dom2array_full($list);
if (count($ar) < 2)
{
echo "S";
continue; // skipping advertisement and similar spam
}
if ((!isset($ar['class'])) || ($ar['class'] != 'g'))
{
echo "?";
continue; // skipping non-search results
}
// adaption to new google layout
//if ($num==2)var_dump($ar);
//if ($num==3)var_dump($ar);
if (isset($ar['div'][1]))
$ar['div']=&$ar['div'][0];
if (isset($ar['div'][1]))
$ar['div']=&$ar['div'][0];
//$ar=&$ar['div']['span']; // Google removed the span
//$ar=&$ar['div']; // change 2012-2013, commented out again
// adaption finished
$divs=$list->getElementsByTagName('div');
$div=$divs->item(1);
getContent($cont,$div);
$num++;
$result['title']=&$ar['h3']['a']['textContent'];
$tmp=strstr(&$ar['h3']['a']['@attributes']['href'],"http");
$result['url']=$tmp;
if (strstr(&$ar['h3']['a']['@attributes']['href'],"interstitial")) echo "!";
$tmp=parse_url(&$result['url']);
$result['host']=&$tmp['host'];
if (strstr($cont,"...
")) // remove some dirt behind the description
{
$result['desc']=substr($cont,0,strpos($cont,"...
"));
} else
if (strstr($cont,"getElementsByTagName('table');
if (strstr($htmdata,"Next")) $next=1;
else
{
$needstart=($page+1)*100;
$findstr="start=$needstart";
if (strstr($htmdata,$findstr)) $next=1;
}
$page++;
}
if (!$next)
{
echo("Finished $runs runs on current search, last google page was $page$NL");
break;
}
}
}
/* Instead of just outputting the data it might be more useful to put it into a database ? */
echo "$NL$NL";
echo "$B Scraping of keywords finished$B_ $NL";
foreach ($results as $result)
{
echo $HR;
echo "$B Keyword:$B_ $result[main_keyword] $result[sub_keyword]$NL";
echo "$B Host:$B_ $result[host]$NL";
echo "$B URL:$B_ $result[url]$NL";
echo "$B Title:$B_ $result[title]$NL";
echo "$B Desc:$B_ $result[desc]$NL";
echo $NL;
}
if ($show_html)
{
echo "";
}
?>