Earthify

Earthify takes a page of Craigslist posts and maps them on Google Earth. It has been tested with both search result pages and browse pages in several categories. The results are divided into “Earthifyable Listings” and “Un-Earthifyable Listings”, the un-earthifyable ones being those that could not be located based on the location provided by the user. In my tests, the listings are Earthifyable more often than not, but it really depends on how much information is in the posting.


/**
*	Represents one Craigslist page.
*	Can either be a page of results or a single listing page.
*/
class Craigslist extends PlacemarkSource {

    /** The URL this source was constructed with. */
    private $url;

    /** One of "single", "feed" or "multiple"; stays unset when the URL matches none of the patterns. */
    private $type;

    const url_pattern = "/craigslist/i";

    private $regexes = array(
        /**
        *	This should match the URL of any single Craigslist listing page.
        *	They will look like
        *	http://montreal.craigslist.org/sub/298262212.html
        *	http://newyork.craigslist.org/brk/sys/299139631.html
        */
        'single_listing_url' => "/craigslist\.[a-z]{2,3}\/(.+)\.html/i",

        /**
        *	This should match the URL of any page with multiple Craigslist listings.
        *	These pages may look like :
        *	http://newyork.craigslist.org/search/bik/brk?query=&minAsk=min&maxAsk=200&hasPic=1
        *	OR
        *	http://montreal.craigslist.org/sub/
        *	http://newyork.craigslist.org/que/bik/
        *	BUT NOT
        *	http://montreal.craigslist.org/sub/298262212.html
        *	http://newyork.craigslist.org/brk/sys/299139631.html
        *
        *	The pattern itself does not exclude single listings; the constructor
        *	relies on testing the single-listing and feed patterns first.
        */
        'multiple_listing_page_url' => "/craigslist\.[a-z]{2,3}/i",

        /**
        *	This should match the URL of any Craigslist listing feed.
        *	These pages should look like:
        *	http://philadelphia.craigslist.org/apa/index.rss
        *	http://philadelphia.craigslist.org/search/apa?query=&minAsk=&maxAsk=200&bedrooms=&addTwo=purrr&addThree=wooof&hasPic=1&format=rss
        */
        'feed_url' => "/craigslist\.[a-z]{2,3}\/(.+)(index\.|format=)?(rss)/i",

        /**
        *	This regex captures, for every listing on a results page:
        *	group 2 = the listing URL, group 3 = the listing title,
        *	group 4 = the location shown in parentheses.
        */
        'get_listings' => "/<p>(.*?)<a href=\"(.*?)\">(.*?)<\/a><font size=\"-1\"> \((.*?)\)<\/font>(.*?)?<\/p>/i",

        /**
        *	In the pages of most Craigslist listings, the city that you are exploring is in a link at the top.
        *	<a href="http://philadelphia.craigslist.org">philadelphia craigslist</a>
        *	or
        *	<a href="/"> new york craigslist</a>
        *	Group 2 is the city name.
        */
        'city_in_title' => "/<a href=\"(.+)?\">(.*?) craigslist<\/a>/i"
    );

    /**
    *	Creates a source for one Craigslist URL and classifies it as a single
    *	listing, an RSS feed, or a page of multiple listings.
    *
    *	Declared as __construct() because PHP 8 no longer treats a method named
    *	after the class as a constructor.
    *
    *	@param string   $_url   URL of the Craigslist page or feed.
    *	@param int|null $limit  Maximum number of listings to examine, or null for all of them.
    */
    public function __construct($_url, $limit=null) {

        $this->url = $_url;
        // NOTE(review): $limit (and $placemarks, used below) are not declared in
        // this class; presumably PlacemarkSource declares them -- confirm.
        $this->limit = $limit;

        // Order matters: the "multiple" pattern also matches single-listing and
        // feed URLs, so it has to be tested last.
        if(preg_match($this->regexes['single_listing_url'], $this->url)) {
            $this->type = "single";    // an actual listing page
        } else if(preg_match($this->regexes['feed_url'], $this->url)) {
            $this->type = "feed";      // a feed of listings
        } else if(preg_match($this->regexes['multiple_listing_page_url'], $this->url)) {
            $this->type = "multiple";  // a search results or browse page
        }
    }

    /**
    *	Takes a URL found on the current page and turns it into a full URL.
    *
    *	- URLs that already carry a scheme are returned untouched.
    *	- Protocol-relative URLs ("//host/path") get an "http:" prefix.
    *	- Root-relative URLs ("/path") are appended to the current host.
    *	- Anything else is resolved against the directory of the current page,
    *	  i.e. everything in its path up to and including the last "/".
    *
    *	@param  string $relative_url  The href value as found in the page.
    *	@return string                An absolute URL.
    */
    public function unrelativize($relative_url) {
        // Already absolute: nothing to resolve.
        if(preg_match('/^[a-z][a-z0-9+.\-]*:\/\//i', $relative_url)) {
            return $relative_url;
        }

        // Protocol-relative: the host is already part of the URL.
        if(substr($relative_url, 0, 2) == '//') {
            return "http:".$relative_url;
        }

        $u = parse_url($this->url);
        $host = (is_array($u) && isset($u['host'])) ? $u['host'] : '';
        $base = "http://".$host;

        if(substr($relative_url, 0, 1) == '/') {
            return $base.$relative_url;
        }

        // The base directory is the path up to and including its last slash.
        // dirname() is not suitable here: it drops the last directory of a path
        // that ends in "/" and never returns a trailing separator.
        $path = (is_array($u) && isset($u['path'])) ? $u['path'] : '/';
        $slash = strrpos($path, '/');
        $dir = ($slash === false) ? '/' : substr($path, 0, $slash + 1);

        return $base.$dir.$relative_url;
    }

    /**
    *	Builds the listings for this page, appends them to $this->placemarks,
    *	and returns that array.
    *
    *	- "single":   one CraigsListing for the page itself.
    *	- "multiple": downloads the page, extracts every listing link and creates
    *	              one CraigsListing per reachable link.
    *	- "feed":     downloads the RSS feed and creates one CraigsListing per
    *	              reachable item.
    *
    *	When a limit was given, only the first $limit links/items are examined;
    *	unreachable ones still count against the limit.
    *
    *	@return array  The accumulated placemarks.
    */
    public function getPlacemarks() {

        switch($this->type) {

            // It is a single listing, so just parse the current page.
            case 'single':
                $this->placemarks[] = new CraigsListing($this->url);
                break;

            // If it is a page of many listings, get all of the URLs.
            case 'multiple':
                $content = HttpClient::quickGet($this->url);

                // The city appears in a link at the top; it stays null when the link is missing.
                $city = preg_match($this->regexes['city_in_title'], $content, $city_match)
                    ? $city_match[2]
                    : null;

                preg_match_all($this->regexes['get_listings'], $content, $matches);
                $listing_urls = $matches[2];
                $titles = $matches[3];

                if(count($listing_urls) == 0) {
                    trigger_error("Earthify couldn't find any listings on the page you provided.");
                }

                $max = isset($this->limit)
                    ? min(count($listing_urls), $this->limit)
                    : count($listing_urls);

                // Loop through the matches and make placemarks from them.
                for($i=0; $i<$max; $i++) {
                    $listing_url = $this->unrelativize($listing_urls[$i]);

                    // Check to make sure the link exists first.
                    if(HttpClient::url_exists($listing_url)) {
                        $listing = new CraigsListing($listing_url);
                        $listing->setTitle($titles[$i]);
                        $listing->setCity($city);
                        $this->placemarks[] = $listing;
                    } else {
                        trigger_error("{$listing_url} did not return a 200 or 302 status, which means that it doesn't exist.", E_USER_WARNING);
                    }
                }
                break;

            // If this is the RSS feed of a Craigslist page, just parse the RSS.
            case 'feed':
                $content = HttpClient::quickGet($this->url);

                // SimpleXMLElement throws an Exception when $content is not well-formed XML.
                $xml = new SimpleXMLElement($content);

                // The city name appears in the channel title; it stays null when the title does not match.
                $city = preg_match("/<title>craigslist \| (.*?) in (.+)<\/title>/", $content, $city_match)
                    ? $city_match[2]
                    : null;

                $item_count = count($xml->item);
                $max = isset($this->limit)
                    ? min($item_count, $this->limit)
                    : $item_count;

                // Loop through the items and make placemarks from them.
                for($i=0; $i<$max; $i++) {

                    $link = (string)$xml->item[$i]->link;

                    // Check to make sure the URL exists first.
                    if(HttpClient::url_exists($link)) {
                        $listing = new CraigsListing($link);
                        // Use the same setters as the "multiple" branch.
                        $listing->setTitle((string)$xml->item[$i]->title);
                        $listing->setCity($city);
                        $this->placemarks[] = $listing;
                    } else {
                        trigger_error("{$link} did not return a 200 or 302 status, which means that it doesn't exist.", E_USER_WARNING);
                    }
                }
                break;
        }

        return $this->placemarks;
    } // end getPlacemarks()
}
/**************************
*
*	Serializer.php
*	Turns an array of Placemark objects into an Earthify KML script.
*
*	Written by Jeff Crouse at Eyebeam
*	February 8, 2007
*
***************************/
class KMLSerializer {

/**
 * Turns a list of placemarks into a KML document.
 *
 * The document contains two folders: "Earthifyable Listings" for placemarks
 * whose getCoords() returns a truthy value, and "Un-Earthifyable Listings"
 * for the rest. Entries that are not Placemark instances are skipped with an
 * E_USER_WARNING.
 *
 * @param  array $placemarks  Objects providing getId(), getTitle(),
 *                            description() and getCoords().
 * @return string             The serialized KML (XML) document.
 */
public static function serialize($placemarks) {

    // Create the XML document and the root tag (kml)
    $dom = new DomDocument('1.0', 'utf-8');
    $root = $dom->createElement("kml");
    $dom->appendChild($root);

    // Create the Document tag
    $doc = $dom->createElement("Document");
    $root->appendChild($doc);

    // Give the document its name.
    $doc->appendChild($dom->createElement("name", "Earthify"));

    // Make two folders within the document: one for locatable items, one for lost ones.
    $found = $dom->createElement('Folder');
    $found->appendChild($dom->createElement("name", "Earthifyable Listings"));
    $doc->appendChild($found);

    $lost = $dom->createElement('Folder');
    $lost->appendChild($dom->createElement("name", "Un-Earthifyable Listings"));
    $lost->appendChild($dom->createElement("description", "Items whose coordinates could not be geocoded."));
    $doc->appendChild($lost);

    // Loop through all of the listings and add them to the document.
    foreach($placemarks as $placemark) {
        if(!($placemark instanceof Placemark)) {
            // trigger_error() only accepts the E_USER_* levels; E_WARNING is rejected.
            trigger_error("KMLSerializer received something that wasn't a Placemark.", E_USER_WARNING);
            continue;
        }

        // Make the Placemark tag that holds everything for this listing.
        $placemark_dom = $dom->createElement("Placemark");
        $placemark_dom->setAttribute("id", $placemark->getId());

        // Make the name tag. createElement() does not escape its value, so the
        // title is escaped by hand.
        $name = $dom->createElement("name", htmlspecialchars($placemark->getTitle()));
        $placemark_dom->appendChild($name);

        // Make the description tag; the text goes into a CDATA section.
        $description = $dom->createElement("description");
        $cdata = $dom->createCDATASection($placemark->description());
        $description->appendChild($cdata);
        $placemark_dom->appendChild($description);

        // Fetch the coordinates once and reuse them below.
        $coordinates = $placemark->getCoords();

        if($coordinates) {
            // Coordinates are known: add a Point and file the placemark under "found".
            $point = $dom->createElement("Point");
            $point->appendChild($dom->createElement("coordinates", $coordinates));
            $placemark_dom->appendChild($point);

            $found->appendChild($placemark_dom);
        } else {
            // No coordinates: file the placemark under "lost".
            $lost->appendChild($placemark_dom);
        }
    }

    return $dom->saveXML();
}
Share and Enjoy:
  • Print
  • Digg
  • del.icio.us
  • Facebook
  • Mixx
  • Google Bookmarks
  • MySpace
  • Netvibes
  • Reddit
  • Slashdot
  • StumbleUpon
  • Technorati
This entry was posted in Events, Projects and tagged , , , , , , , , , . Bookmark the permalink. Post a comment or leave a trackback: Trackback URL.

Post a Comment

Your email is never published nor shared. Required fields are marked *

*
*

You may use these HTML tags and attributes: <a href="" title=""> <abbr title=""> <acronym title=""> <b> <blockquote cite=""> <cite> <code> <del datetime=""> <em> <i> <q cite=""> <strike> <strong>