Scraping Amazon Ranks Code

Our main class, amazon_rank_finder.php

<?php
/**
 * amazon_rank_finder.php
 *
 * Single-page front end: shows a form asking for an ASIN and, when the form
 * has been posted, prints the sales-rank information gathered for that ASIN
 * on each configured Amazon domain.
 */
error_reporting(E_ALL ^ E_STRICT);
ini_set('display_errors', 'on');

define('DEFAULT_ASIN', "1430235608");

// Only a posted *string* counts as a submission; a request such as
// asin[]=x would otherwise hand an array to preg_replace()/str_pad().
$asin = DEFAULT_ASIN;
$asin_submitted = isset($_POST['asin']) && is_string($_POST['asin']);
if ($asin_submitted) {
    // Strip everything except letters and digits. The id is later used in a
    // file name and in a URL, so nothing else may get through.
    $asin = preg_replace("/[^a-zA-Z0-9]/", "", $_POST['asin']);
}
// Pad once, up front, so the value shown in the form is exactly the value
// that is looked up and used as the cache file name.
$asin = str_pad($asin, 10, '0', STR_PAD_LEFT);
?>
<!doctype HTML>
<html>
<head>
<title>Amazon Rank Finder</title>
<link rel="stylesheet" href="style.css"/>
</head>
<body>
<form action="<?php echo htmlspecialchars($_SERVER['PHP_SELF'], ENT_QUOTES, 'UTF-8'); ?>" method="POST">
<input value="<?php echo htmlspecialchars($asin, ENT_QUOTES, 'UTF-8'); ?>" type="text" name="asin" />
<input type="Submit"/>
</form>
<div>
<?php
require_once('config.php');
require_once('xml_io.php');
require_once('scraper.php');
require_once('view.php');

/**
 * Coordinates one look-up: loads the per-product XML cache, decides whether
 * the cached data is older than RECHECK_INTERVAL, refreshes it through
 * Scraper when it is, and renders the outcome through View.
 */
class AmazonRankFinder
{
    private $id = 0;
    private $domains = null;

    //objects
    private $model = null;
    private $view = null;

    public function __construct()
    {
        $this->model = new XML_IO;
        $this->view = new View;
    }

    /**
     * Looks up one product and prints the result.
     *
     * @param string $id      Product id; used for the cache file name and the product URL.
     * @param array  $domains Amazon domain suffixes to query, e.g. 'ca', 'com', 'co.uk'.
     */
    public function run($id, Array $domains = array('ca', 'com', 'co.uk'))
    {
        $this->domains = $domains;
        $this->id = $id;

        $this->model->loadXML($this->id);

        if ($this->timeToCheck()) {
            $this->checkRemoteContent();
            $this->model->updateAndWriteFile();
        } else {
            $this->checkCachedContent();
        }

        $this->view->display($this->id);
    }

    /**
     * @return bool True when more than RECHECK_INTERVAL seconds have passed
     *              since the last check recorded in the cache.
     */
    private function timeToCheck()
    {
        return (time() - $this->model->getLastCheckTime() > RECHECK_INTERVAL);
    }

    /**
     * Queries every configured domain and appends the results to the view.
     * Title and image are taken from the first domain that returns data.
     */
    private function checkRemoteContent()
    {
        $need_title_and_image = true;
        $this->view->appendToBody("Checking id: $this->id now...<br/>");

        foreach ($this->domains as $domain) {
            $results = Scraper::fetchUpdatedData($domain, $this->id, $need_title_and_image);
            if (!$results) {
                continue;
            }

            if ($need_title_and_image) {
                $this->model->setTitle($results['title']);
                $this->model->setImage($results['image']);
                $this->view->appendToBody(html_entity_decode($this->view->productInformationAsHTML(
                    $results['title'],
                    $results['image']
                )));
                $need_title_and_image = false;
                $this->view->appendToBody("<div style='float: left;'>");
            }

            //adds best/worst view
            $ranks = $this->model->updateXML($domain, $results['sales_rank']);
            if (!empty($ranks)) {
                $this->view->appendToBody(html_entity_decode(
                    $this->view->domainRankAsHTML($this->id, $domain, $ranks, false)
                ));
            }
        }

        // The floating container is opened only once a domain has returned
        // data (which is also when the flag flips), so close it only then;
        // otherwise a stray </div> would be emitted.
        if (!$need_title_and_image) {
            $this->view->appendToBody('</div>');
        }
    }

    /**
     * Renders the product from the XML cache without contacting Amazon.
     */
    private function checkCachedContent()
    {
        $this->view->appendToBody(html_entity_decode($this->view->productInformationAsHTML(
            $this->model->getTitle(),
            $this->model->getImage()
        )));
        $this->view->appendToBody(
            "<div style='float: left;'>Last checked id: $this->id at "
            . date('m-d-Y h:i:sa', $this->model->getLastCheckTime())
            . "<br/>"
        );

        foreach ($this->domains as $domain) {
            $site_node = $this->model->siteExistsInXML($domain);
            $ranks = $this->model->queryRankNodesFromXML($site_node);
            if (!empty($ranks)) {
                $this->view->appendToBody(html_entity_decode(
                    $this->view->domainRankAsHTML($this->id, $domain, $ranks)
                ));
            }
        }

        $this->view->appendToBody('</div>');
    }
}

if ($asin_submitted) {
    $amazon_rank_finder = new AmazonRankFinder();
    $amazon_rank_finder->run($asin, array('ca', 'com', 'co.uk'));
}
?>
</div>
</body>
</html>

Our config.php file

<?php
/**
 * config.php - settings shared by the rank finder scripts.
 *
 * Each statement sits on its own line: a `//` comment runs to the end of the
 * line, so a trailing comment must never be followed by code on that line.
 */

// Minimum age, in seconds, of the cached data before a product is fetched again.
define('RECHECK_INTERVAL', 60 * 60); //one hour

// Selectors handed to pq() to pick the rank, image and title out of a product page.
define('SALES_RANK_DOM_ID', "#SalesRank");
define('IMAGE_DOM_ID', "#prodImage");
define('TITLE_DOM_ID', "#btAsinTitle");

// A product URL is built as PREFIX . domain . SUFFIX . id,
// e.g. http://www.amazon.ca/dp/1234567890
define('AMAZON_URL_PREFIX', 'http://www.amazon.');
define('AMAZON_URL_SUFFIX', '/dp/');

// Time zone used by date() when the "last checked" time is displayed.
date_default_timezone_set('America/Regina');
?>

Our scraper.php file

<?php
require_once('config.php');
require_once('phpQuery.php');

/**
 * Downloads an Amazon product page and extracts the pieces the rank finder
 * needs from it.
 */
class Scraper
{
    /**
     * Fetches the product page for $id on the given Amazon domain.
     *
     * @param string $domain    Domain suffix appended to AMAZON_URL_PREFIX, e.g. 'ca' or 'co.uk'.
     * @param string $id        Product id appended after AMAZON_URL_SUFFIX.
     * @param bool   $get_title When true, the title and image are selected as well.
     *
     * @return array|false False when the page could not be downloaded. Otherwise an
     *                     array with the keys 'sales_rank', 'title' and 'image'; each
     *                     value is whatever pq() returned for the matching selector,
     *                     or "" for title/image when $get_title is false.
     */
    public static function fetchUpdatedData($domain, $id, $get_title = false)
    {
        $title = $image = "";

        //http://www.amazon.ca/dp/1234567890
        $url = AMAZON_URL_PREFIX.$domain.AMAZON_URL_SUFFIX.$id;

        $contents = file_get_contents($url);
        if ($contents === false) {
            //return early if URL is not found
            return false;
        }

        phpQuery::newDocument($contents);

        if ($get_title) {
            $title = pq(TITLE_DOM_ID);
            $image = pq(IMAGE_DOM_ID);
        }

        return array(
            'sales_rank' => pq(SALES_RANK_DOM_ID),
            'title' => $title,
            'image' => $image,
        );
    }

    /**
     * Extracts the numeric Books rank from the sales-rank markup.
     *
     * Accepts the heading written as "Bestsellers", "Best Sellers" or
     * "Best sellers" (matched case-insensitively), followed by "</b>", an
     * optional "#", a number with optional thousands separators, and " in Books".
     *
     * @param string $description Markup of the sales-rank element.
     *
     * @return int The rank with separators removed, or -1 when no rank was found.
     */
    public static function parseRankFromDescription($description)
    {
        preg_match_all(
            '{Amazon Best\s?sellers Rank:</b>\s*#?(([0-9]{0,3},)?([0-9]{0,3},)?[0-9]{1,3}) in Books}mi',
            $description,
            $matches
        );

        if (isset($matches[1]) && isset($matches[1][0])) {
            return intval(str_replace(',', '', $matches[1][0]));
        }

        return -1;
    }
}
?>

Our xml_io.php file

<?php
require_once('config.php');
// Xml_IO calls Scraper::parseRankFromDescription(), so load it explicitly
// instead of relying on the including script to have done so.
require_once('scraper.php');

// Skeleton written to disk the first time a product id is looked up.
$default_xml_string = <<<EOT
<?xml version="1.0" encoding="UTF-8" ?>
<info>
    <lastcheck></lastcheck>
    <title></title>
    <image></image>
    <sites/>
</info>
EOT;
define('DEFAULT_XML_STRING', $default_xml_string);

/**
 * Reads and writes the per-product XML cache file "<id>.xml".
 *
 * The file holds the time of the last check, the product title and image,
 * and one <domain cc="..."> element per Amazon domain containing the
 * current_rank, best_rank and worst_rank values.
 */
class Xml_IO
{
    private $dom = null;
    private $xpath = null;
    private $id = 0;
    private $filename = null;

    //nodes
    private $lastchecktime_node = null;
    private $title_node = null;
    private $image_node = null;
    private $sites_node = null;

    //node textcontent
    private $lastchecktime = 0;
    private $title = null;
    private $image = null;

    public function __construct()
    {
        $this->dom = new DOMDocument();
    }

    /**
     * Writes the default skeleton to the current file name unless the file
     * already exists.
     */
    public function createFileIfItDoesNotExist()
    {
        if (!file_exists($this->filename)) {
            file_put_contents($this->filename, DEFAULT_XML_STRING);
        }
    }

    /**
     * Loads (creating it first if necessary) the cache file for $id and reads
     * the last check time, title and image from it.
     *
     * When the file cannot be parsed or lacks one of the expected elements,
     * the default skeleton is used in memory instead; the last check time is
     * then 0, so callers treat the data as stale.
     *
     * @param string $id Product id; the file name is "<id>.xml".
     */
    public function loadXML($id)
    {
        $this->id = $id;
        $this->filename = $this->id.'.xml';
        $this->createFileIfItDoesNotExist();

        $loaded = $this->dom->load($this->filename);
        if (!$loaded || !$this->bindNodes()) {
            $this->dom->loadXML(DEFAULT_XML_STRING);
            $this->bindNodes();
        }

        $this->lastchecktime = (int) $this->lastchecktime_node->textContent;
        $this->title = $this->title_node->textContent;
        $this->image = $this->image_node->textContent;
    }

    /**
     * Creates the XPath helper for the current document and looks up the
     * four top-level nodes.
     *
     * @return bool True when all four nodes were found.
     */
    private function bindNodes()
    {
        $this->xpath = new DOMXPath($this->dom);
        $this->lastchecktime_node = $this->xpath->query("//lastcheck")->item(0);
        $this->title_node = $this->xpath->query("//title")->item(0);
        $this->image_node = $this->xpath->query("//image")->item(0);
        $this->sites_node = $this->xpath->query("//sites")->item(0);

        return $this->lastchecktime_node && $this->title_node
            && $this->image_node && $this->sites_node;
    }

    /**
     * Stores a freshly scraped rank for $domain, creating the <domain>
     * element when it does not exist yet.
     *
     * @param string $domain     Domain suffix stored in the "cc" attribute.
     * @param mixed  $sales_rank Sales-rank markup; it is converted to a string.
     *
     * @return array|null Same shape as queryRankNodesFromXML(); null when the update failed.
     */
    public function updateXML($domain, $sales_rank)
    {
        if ($this->siteExistsInXML($domain)) {
            return $this->updateSiteInXML($domain, $sales_rank);
        }

        return $this->appendSiteInXML($domain, $sales_rank);
    }

    /**
     * Finds the <domain> element whose "cc" attribute equals $domain.
     *
     * @return DOMElement|false The element, or false when there is none.
     */
    public function siteExistsInXML($domain)
    {
        foreach ($this->xpath->query("//domain") as $site) {
            // getAttribute() yields "" for a missing attribute, so a <domain>
            // without "cc" is skipped instead of causing a null dereference.
            if ($site->getAttribute("cc") === (string) $domain) {
                return $site;
            }
        }

        return false;
    }

    /**
     * Updates the ranks of an existing <domain> element.
     *
     * The element is located through siteExistsInXML() rather than by walking
     * the child nodes of <sites>: child nodes may include whitespace text
     * nodes, which have no attributes.
     */
    private function updateSiteInXML($domain, $sales_rank)
    {
        $site = $this->siteExistsInXML($domain);
        if (!$site) {
            return null;
        }

        try {
            return $this->updateDomainRank($site, $sales_rank);
        } catch (Exception $e) {
            // Log instead of dumping the exception object into the page.
            error_log('Xml_IO: could not update ranks for '.$domain.': '.$e->getMessage());
        }

        return null;
    }

    /**
     * Adds a new <domain cc="..."> element; best and worst rank both start
     * at the rank parsed from $sales_rank (-1 when none could be parsed).
     */
    private function appendSiteInXML($domain, $sales_rank)
    {
        $site = $this->dom->createElement("domain");
        $site->setAttribute("cc", $domain);
        $this->sites_node->appendChild($site);

        $this->createAndAppend($site, "current_rank", $sales_rank);
        $exact_rank = Scraper::parseRankFromDescription($sales_rank);
        $this->createAndAppend($site, "best_rank", $exact_rank);
        $this->createAndAppend($site, "worst_rank", $exact_rank);

        return $this->queryRankNodesFromXML($site);
    }

    /**
     * Appends <$child_node_name>$value</$child_node_name> to $parent.
     *
     * The value goes in as a text node, as in updateNode(), so that "&" and
     * markup characters are stored literally and escaped when saved. The
     * value argument of createElement() treats "&" as the start of an entity
     * reference.
     */
    private function createAndAppend($parent, $child_node_name, $value)
    {
        $child = $this->dom->createElement($child_node_name);
        $child->appendChild($this->dom->createTextNode((string) $value));
        $parent->appendChild($child);

        return $child;
    }

    /**
     * Replaces the first child of $node with a text node holding $value, or
     * appends one when $node has no children. Does nothing for a missing node.
     */
    private function updateNode($node, $value)
    {
        if (!$node) {
            return;
        }

        $text = $this->dom->createTextNode((string) $value);
        if ($node->firstChild) {
            $node->replaceChild($text, $node->firstChild);
        } else {
            $node->appendChild($text);
        }
    }

    /**
     * Collects the rank nodes below a <domain> element.
     *
     * @param DOMElement|false $site_node Result of siteExistsInXML().
     *
     * @return array Empty when $site_node is falsy or one of the three rank
     *               nodes is missing. Otherwise 'current', 'best' and 'worst'
     *               hold the DOM nodes and 'exact' holds the integer parsed
     *               from the current rank text (-1 when none was found).
     */
    public function queryRankNodesFromXML($site_node)
    {
        if (!$site_node) {
            return array();
        }

        $current = $this->xpath->query("current_rank", $site_node)->item(0);
        $best = $this->xpath->query("best_rank", $site_node)->item(0);
        $worst = $this->xpath->query("worst_rank", $site_node)->item(0);
        if (!$current || !$best || !$worst) {
            return array();
        }

        return array(
            'current' => $current,
            'exact' => Scraper::parseRankFromDescription($current->textContent),
            'best' => $best,
            'worst' => $worst,
        );
    }

    /**
     * Writes the new current rank into $site and moves the best/worst marks
     * when the new rank is better (lower) or worse (higher) than before.
     */
    private function updateDomainRank($site, $sales_rank)
    {
        $this->updateNode($this->xpath->query("current_rank", $site)->item(0), $sales_rank);

        $ranks = $this->queryRankNodesFromXML($site);
        if (empty($ranks) || $ranks['exact'] === -1) {
            return $ranks;
        }

        $exact = $ranks['exact'];
        $best = intval($ranks['best']->textContent);
        $worst = intval($ranks['worst']->textContent);

        // A stored best rank below 1 means no rank has been parsed yet
        // (-1 or empty); a real rank is never lower than that, so it has to
        // be replaced explicitly.
        if ($best < 1 || $exact < $best) {
            $this->updateNode($ranks['best'], $exact);
        }
        if ($exact > $worst) {
            $this->updateNode($ranks['worst'], $exact);
        }

        return $ranks;
    }

    /**
     * Stamps the current time, stores title and image, and writes the
     * document back to the cache file.
     */
    public function updateAndWriteFile()
    {
        try {
            $this->updateNode($this->lastchecktime_node, time());
            $this->updateNode($this->title_node, htmlentities((string) $this->title));
            $this->updateNode($this->image_node, htmlentities((string) $this->image));

            // saveXML() already serialises in the document's encoding
            // (UTF-8); passing it through utf8_encode() would encode
            // non-ASCII characters a second time.
            if (file_put_contents($this->filename, $this->dom->saveXML()) === false) {
                error_log('Xml_IO: could not write '.$this->filename);
            }
        } catch (Exception $e) {
            error_log('Xml_IO: could not save '.$this->filename.': '.$e->getMessage());
        }
    }

    public function getLastCheckTime()
    {
        return $this->lastchecktime;
    }

    public function getTitle()
    {
        return $this->title;
    }

    public function getImage()
    {
        return $this->image;
    }

    public function setTitle($title)
    {
        $this->title = $title;
    }

    public function setImage($image)
    {
        $this->image = $image;
    }
}
?>

Our view.php file

<?php
/**
 * Collects HTML fragments and prints them in one go.
 *
 * domainRankAsHTML() uses the AMAZON_URL_PREFIX and AMAZON_URL_SUFFIX
 * constants, which are not defined in this file; the including script has to
 * define them before that method is called.
 */
class View
{
    private $body;

    public function __construct()
    {
        $this->body = "";
    }

    /**
     * Appends $input to the buffered page body.
     */
    public function appendToBody($input)
    {
        $this->body .= $input;
    }

    /**
     * Builds the left-hand product box.
     *
     * Both arguments are inserted as-is, without escaping.
     *
     * @param string $title Goes inside the <h2> element.
     * @param string $image Goes directly after the heading.
     *
     * @return string
     */
    public function productInformationAsHTML($title, $image)
    {
        $html = '<div style="float: left; width: 240px; margin-right: 20px;"><h2>'.$title."</h2>";
        $html .= $image;
        $html .= '</div>';

        return $html;
    }

    /**
     * Builds the box showing current, best and worst rank for one domain.
     *
     * @param string $asin     Product id used in the link to the product page.
     * @param string $domain   Domain suffix; used in the link and as its text.
     * @param array  $ranks    Nodes under the keys 'current', 'best' and 'worst';
     *                         an empty array produces an empty container.
     * @param bool   $entities When true, the current rank text is passed
     *                         through htmlentities(); otherwise it is output raw.
     *
     * @return string
     */
    public function domainRankAsHTML($asin, $domain, $ranks, $entities = true)
    {
        $html = "<div class='domainContainer'>";

        if (!empty($ranks)) {
            $html .= "<span class='domainName'>";
            // The start tag must not be self-closed ("/>"): the link text and
            // the closing </a> follow it.
            $html .= "<a href='".AMAZON_URL_PREFIX.$domain.AMAZON_URL_SUFFIX.$asin."' rel='external'>";
            $html .= $domain."</a>";
            $html .= "</span><br/>";
            $html .= "<div class='domainRank'>";

            if ($entities) {
                $html .= htmlentities($ranks['current']->textContent, ENT_QUOTES, 'UTF-8');
            } else {
                $html .= $ranks['current']->textContent;
            }

            $html .= "<strong>Best Rank:</strong> ".$ranks['best']->textContent."<br/>";
            $html .= "<strong>Worst Rank:</strong> ".$ranks['worst']->textContent."<br/><br/>";
            $html .= "</div>";
        }

        $html .= "</div>";

        return $html;
    }

    /**
     * Prints the buffered body. $id is accepted but not used.
     */
    public function display($id)
    {
        print $this->body;
    }
}
?>

Our style.css file

/* Page background. */
body {
    background: #F7F7F7;
}

/* Rounded grey box wrapping the ranks of one Amazon domain. */
.domainContainer {
    border-radius: 5px;
    border: 1px solid #CCCCCC;
    background: #EEEEEE;
    padding: 10px 20px;
    margin: 10px 0px;
}

/* Orange label carrying the domain link. */
.domainName {
    background: orange;
    padding: 5px 10px;
    border-radius: 3px;
    font-weight: bold;
}

/* Rank text shown below the domain label. */
.domainRank {
    margin-top: 10px;
}

Comments

Add new comment

Type the characters you see in this picture. (verify using audio)
Type the characters you see in the picture above; if you can't read them, submit the form and a new image will be generated. Not case sensitive.