Scrape websites with Node.js!

August 27th, 2011

I really wanted to experiment a little bit with nodejs.


So some days ago a friend of mine told me that he had used a cool Python library to scrape the information he wanted from a site! That sounded to me like a very good exercise!


The idea is to use jQuery's powerful selectors to extract whatever I want from an HTML page. If I can do that, I can also extract the URL of the next page and then put the same mechanism to work again for that page! The code I finally had to write was about 50 lines!

// scrapy module: uses jsdom to load a page and runs caller-supplied callbacks on it.
// NOTE(review): this listing is incomplete — several closing braces, the body of the
// local `done` helper, and the opening of the call that receives the
// `html` / `scripts` / `done` options are missing. Restore them from the original
// source before running.
var jsdom = require( 'jsdom' ),
	//fs = require( 'fs' )
	//underscore = fs.readFileSync( './underscore.js' ).toString(),
	// scrapy( conf, counter )
	//   conf.url        - value passed as the `html` option below
	//   conf.getNextUrl - optional function called with `$`; a truthy result is used as the next url
	//   conf.filterPage - optional function called with `$`
	//   conf.done       - optional completion callback (kept locally as `finalDone`)
	//   counter         - optional; a falsy value falls back to 1
	scrapy = function( conf, counter ) {
		// Every conf field is optional; a missing (falsy) one is normalised to null.
		var url = conf.url || null,
			getNextUrl = conf.getNextUrl || null,
			filterPage = conf.filterPage || null,
			finalDone = conf.done || null,
			counter = counter || 1,
			// Local completion helper. The visible condition requires counter === 0 and a
			// caller-supplied conf.done. NOTE(review): the rest of its body is missing here.
			done = function() {
				if ( counter === 0 && finalDone !== null) {

			// Options object, presumably for a jsdom call whose opening line is missing — TODO confirm.
			html: url,
			scripts: [ '' ],
			// Callback taking ( errors, window ).
			done: function( errors, window ) {
				// Errors are only logged here; no early return is visible in this listing.
				if ( errors ) {
					console.log( errors );
				// `$` is read from the window object; presumably jQuery — TODO confirm.
				var $ = window.$,

				// First of all try to scrap the next url available
				if ( getNextUrl ) {
					// NOTE(review): no declaration of `nexturl` is visible; it may belong to the
					// truncated `var` statement above — verify it is not an implicit global.
					nexturl = getNextUrl( $ );
					if ( nexturl ) {
						// Recurse on the next url with the same getNextUrl / filterPage callbacks;
						// the local `done` helper is handed down as the nested call's conf.done.
						// `counter++` passes the current value, then increments the local copy.
						scrapy( { 
							url: nexturl, //getNextUrl( $ ), 
							getNextUrl: getNextUrl,
							filterPage: filterPage,
							done: done }, counter++	 );

				// Run the caller's page filter, or log that none was supplied.
				if ( filterPage ) {
					filterPage( $ );
				else {
					console.log( 'No filterPage available' );
		//console.log( 'Done with scrap of ' + url);

// Expose scrapy as this module's only export.
exports.scrapy = scrapy;

The conf argument that the scrapy function expects should have a url (the page we want to filter), a filterPage function that filters the page, and a getNextUrl function that inspects the page to find the next URL that should be scraped. If you want, you can also provide a done function, which will get called only once all the scrapes have finished! Here is an example:

// Example client of the scrapy module.
// NOTE(review): this listing is incomplete — the `ad` object literal and the
// getAdFromTD function are never closed, and two assignments below have lost their
// left-hand side. Restore them from the original source before running.
var scrapy = require('./scrapy.js').scrapy,

	// getAdFromTD( $td ): builds and returns an `ad` object from the html of the
	// anchor(s) found under $td. Presumably $td is a jQuery-wrapped table cell — TODO confirm.
	getAdFromTD = function( $td ) {
		var ad = {
				// Anchor html with trailing whitespace stripped.
				title: $td.find('a').html().replace(/\s*$/g, '')

		// NOTE(review): `$` is not defined in the visible scope of this function; each
		// `$td = $;` below looks like a truncated traversal expression — verify.
		$td = $;
		// Address: anchor html with every <br> replaced by a comma.
		ad.address = $td.find('a').html().replace( /<br>/g, ',' );

		$td = $;
		// Split the anchor html on <br>; segments 0 and 1 become rooms and floor.
		// NOTE(review): no declaration of `type` is visible — verify it is not an implicit global.
		type = $td.find('a').html().split( '<br>' ); //.replace(/<br>/g, ',').replace(/\n|\s{2,}/g, ''),
		ad.rooms = type[0].replace( /\sRooms[\n|\s]+/g,  '' );
		// NOTE(review): the second assignment on the next line has no left-hand side
		// (syntax error); the target for type[2] was lost.
		ad.floor = type[1].replace( /.\s Floor/g, '' ); = type[2].replace( /[\n|\s]+$/g, '' );

		$td = $;
		// Split again on <br>; segment 0 becomes type and segment 2 becomes price.
		type = $td.find('a').html().split('<br>'); //.replace(/<br>/g, ',').replace(/\n|\s{2,}/g, '');
		// NOTE(review): the second assignment on the next line has no left-hand side
		// (syntax error); the target for type[1] was lost.
		ad.type = type[0].replace( /[\n|\s]+$/g, '' ); = type[1].replace( /[\n|\s]+$/g, '' );
		ad.price = type[2];

		return ad;
	// Accumulates every ad pushed by the filterPage callback of the scrapy call.
	ads = []; 

// Start scraping: each callback receives `$` and works on the page through it.
// NOTE(review): the callbacks below are missing their closing braces in this listing.
scrapy( {
		// Start url; it is an empty string in this listing.
		url: "",
		// Next page: the href attribute of the element matching 'a.forward.iconLink'.
		getNextUrl: function( $ ) {
				return $('a.forward.iconLink').attr( 'href' );
		// For every '#objectList tr' row, look up its '.tdTitle' element; when exactly
		// one is found, build an ad from it, store it in `ads` and log it.
		filterPage: function( $ ) {
				$('#objectList tr')
					.each( function() {
						var $td = $( this ).find( '.tdTitle' ),
						if ( $td.size() == 1) {
							ad = getAdFromTD( $td ); 
							ads.push( ad );
							console.log( ad );
				// Logs the current length of `ads`.
				console.log( 'Found ' + ads.length);
		// Completion callback: logs the total number of collected ads.
		done: function() {
				console.log( 'Done! found ' + ads.length + ' ads in total!');
		// NOTE(review): the text after `} );` on the next line looks like page-footer text, not code.
		} ); v3.14.1 © Georgios Valotasios - CSS inspired by Adam Wathan's blog