Scrape websites with nodejs!

Γιώργος Βαλοτάσιος  27/08/2011

I really wanted to experiment a little bit with nodejs.

Target

So some days ago a friend of mine told me that he had used a cool python library to scrape the information he wanted from a site! That sounded like a very good exercise to me!

Solution

The idea is to use jQuery's powerful selectors to extract whatever I want from an html page. If I can do that, I can also extract the url of the next page and then put the mechanism to work again for that page! The code I finally had to write was about 50 lines!
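Before the full module, here is a minimal sketch of the core trick (the URL is just a placeholder): jsdom loads the page, injects jQuery into it, and hands the resulting window to a callback where ordinary jQuery selectors work.

var jsdom = require( 'jsdom' );

jsdom.env({
  html: 'http://example.com/',
  scripts: [ 'http://code.jquery.com/jquery-1.5.min.js' ],
  done: function( errors, window ) {
    var $ = window.$;
    // plain jQuery from here on: log the href of every link
    $( 'a' ).each( function() {
      console.log( $( this ).attr( 'href' ) );
    });
  }
});

The full module just wraps this call in a function that keeps recursing on the next url: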

var jsdom = require( 'jsdom' ),
  scrapy = function( conf, counter ) {
    var url = conf.url || null,
      getNextUrl = conf.getNextUrl || null,
      filterPage = conf.filterPage || null,
      finalDone = conf.done || null,
      // pages still pending at this level: this page itself,
      // plus the next one once it has been spawned
      counter = counter || 1,
      done = function() {
        counter--;
        if ( counter === 0 && finalDone !== null ) {
          finalDone();
        }
      };

    jsdom.env({
      html: url,
      scripts: [ 'http://code.jquery.com/jquery-1.5.min.js' ],
      done: function( errors, window ) {
        if ( errors ) {
          console.log( errors );
          // keep the bookkeeping consistent and stop here
          done();
          return;
        }
        var $ = window.$,
          nexturl;

        // First of all try to scrape the next url available
        if ( getNextUrl ) {
          nexturl = getNextUrl( $ );
          if ( nexturl ) {
            counter++; // this level now also waits on the next page
            scrapy( {
              url: nexturl,
              getNextUrl: getNextUrl,
              filterPage: filterPage,
              done: done } );
          }
        }

        if ( filterPage ) {
          filterPage( $ );
        }
        else {
          console.log( 'No filterPage available' );
        }
        
        // this page itself is finished
        done();
      }
    });
  };

exports.scrapy = scrapy;
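One detail worth spelling out is the counter/done pair: it is a tiny reference count. Every level starts at one (for its own page), gets bumped when it spawns the next page, and every finished page calls done to decrement it; when a level reaches zero it notifies the previous one, all the way up to the done you provided. A flat version of the same bookkeeping (the names here are only illustrative) looks like this:

var pending = 1, // one for the first page
  done = function() {
    pending--; // one page finished
    if ( pending === 0 ) {
      console.log( 'all pages scraped' );
    }
  };

// whenever a next page is found: pending++ before scraping it,
// and make sure that page calls done() when it finishes too.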

The conf argument that the scrapy function expects should have a url (the page we want to filter), a filterPage function that filters the page, and a getNextUrl function that inspects the page to find the next url that should be scraped. If you want, you can also provide a done function which will be called only when all the scraping has finished! Here is an example:

var scrapy = require('./scrapy.js').scrapy,

  // turn a result row's four <td>s into a plain ad object, walking
  // the cells with .next(): title, address, rooms/floor/space,
  // and finally type/build/price
  getAdFromTD = function( $td ) {
    var ad = {
        title: $td.find('a').html().replace( /\s*$/g, '' )
      },
      type;

    $td = $td.next();
    ad.address = $td.find('a').html().replace( /<br>/g, ',' );

    $td = $td.next();
    type = $td.find('a').html().split( '<br>' );
    ad.rooms = type[0].replace( /\sRooms[\n\s]+/g, '' );
    ad.floor = type[1].replace( /.\s Floor/g, '' );
    ad.space = type[2].replace( /[\n\s]+$/g, '' );

    $td = $td.next();
    type = $td.find('a').html().split( '<br>' );
    ad.type = type[0].replace( /[\n\s]+$/g, '' );
    ad.build = type[1].replace( /[\n\s]+$/g, '' );
    ad.price = type[2];

    return ad;
  },
  ads = []; 

scrapy( {
    url: "http://www.homegate.ch/rent/apartment-and-house/region-zuerich/matching-list?a=default&tab=list&l=default&cid=1585974&ao=&am=Z%C3%BCrich&ep=1&ac=1.5&ad=2.0&incsubs=default&tid=1&fromItem=ctn-zh&ag=1000&ah=2000&be=",
    getNextUrl: function( $ ) {
        return $('a.forward.iconLink').attr( 'href' );
      },
    filterPage: function( $ ) {
        $('#objectList tr')
          .each( function() {
            var $td = $( this ).find( '.tdTitle' ),
              ad;
        
            if ( $td.size() === 1 ) {
              ad = getAdFromTD( $td ); 
              ads.push( ad );
              console.log( ad );
            }
          });
        console.log( 'Found ' + ads.length + ' ads so far' );
      },
    done: function() {
        console.log( 'Done! found ' + ads.length + ' ads in total!');
      } 
    } );
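Assuming the module above is saved as scrapy.js next to this script, the whole thing runs with a plain node command, e.g. node example.js (the file name is just for illustration), and prints every ad as it is scraped, followed by the total.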