Scrape websites with nodejs!

Γιώργος Βαλοτάσιος  27/08/2011

I really wanted to experiment a little bit with nodejs.

Target

So some days ago a friend of mine told me that he had used a cool python library to scrape the information he wanted from a site! That sounded like a very good exercise to me!

Solution

The idea is to use jQuery's powerful selectors to extract whatever I want from an html page. If I can do that, I can also extract the url of the next page and then put the mechanism to work again for that page! The code I finally had to write was about 50 lines!
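Before the full module, here is a minimal sketch of the core trick (the URL is just a placeholder): jsdom loads the page, injects jQuery into it, and hands the resulting window to a callback where ordinary jQuery selectors work.

var jsdom = require( 'jsdom' );

jsdom.env({
  html: 'http://example.com/',
  scripts: [ 'http://code.jquery.com/jquery-1.5.min.js' ],
  done: function( errors, window ) {
    var $ = window.$;
    // plain jQuery from here on: log the href of every link
    $( 'a' ).each( function() {
      console.log( $( this ).attr( 'href' ) );
    });
  }
});

The full module just wraps this call in a function that keeps recursing on the next url: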

var jsdom = require( 'jsdom' ),
  scrapy = function( conf, counter ) {
    var url = conf.url || null,
      getNextUrl = conf.getNextUrl || null,
      filterPage = conf.filterPage || null,
      finalDone = conf.done || null,
      // pages still pending at this level: this page itself,
      // plus the next one once it has been spawned
      counter = counter || 1,
      done = function() {
        counter--;
        if ( counter === 0 && finalDone !== null ) {
          finalDone();
        }
      };

    jsdom.env({
      html: url,
      scripts: [ 'http://code.jquery.com/jquery-1.5.min.js' ],
      done: function( errors, window ) {
        if ( errors ) {
          console.log( errors );
          // keep the bookkeeping consistent and stop here
          done();
          return;
        }
        var $ = window.$,
          nexturl;

        // First of all try to scrape the next url available
        if ( getNextUrl ) {
          nexturl = getNextUrl( $ );
          if ( nexturl ) {
            counter++; // this level now also waits on the next page
            scrapy( {
              url: nexturl,
              getNextUrl: getNextUrl,
              filterPage: filterPage,
              done: done } );
          }
        }

        if ( filterPage ) {
          filterPage( $ );
        }
        else {
          console.log( 'No filterPage available' );
        }
        
        // this page itself is finished
        done();
      }
    });
  };

exports.scrapy = scrapy;
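One detail worth spelling out is the counter/done pair: it is a tiny reference count. Every level starts at one (for its own page), gets bumped when it spawns the next page, and every finished page calls done to decrement it; when a level reaches zero it notifies the previous one, all the way up to the done you provided. A flat version of the same bookkeeping (the names here are only illustrative) looks like this:

var pending = 1, // one for the first page
  done = function() {
    pending--; // one page finished
    if ( pending === 0 ) {
      console.log( 'all pages scraped' );
    }
  };

// whenever a next page is found: pending++ before scraping it,
// and make sure that page calls done() when it finishes too.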

The conf argument that the scrapy function expects should have a url (the page we want to filter), a filterPage function that filters the page, and a getNextUrl function that inspects the page to find the next url that should be scraped. If you want, you can also provide a done function which will be called only when all the scraping has finished! Here is an example:

var scrapy = require('./scrapy.js').scrapy,

  // turn a result row's four <td>s into a plain ad object, walking
  // the cells with .next(): title, address, rooms/floor/space,
  // and finally type/build/price
  getAdFromTD = function( $td ) {
    var ad = {
        title: $td.find('a').html().replace( /\s*$/g, '' )
      },
      type;

    $td = $td.next();
    ad.address = $td.find('a').html().replace( /<br>/g, ',' );

    $td = $td.next();
    type = $td.find('a').html().split( '<br>' );
    ad.rooms = type[0].replace( /\sRooms[\n\s]+/g, '' );
    ad.floor = type[1].replace( /.\s Floor/g, '' );
    ad.space = type[2].replace( /[\n\s]+$/g, '' );

    $td = $td.next();
    type = $td.find('a').html().split( '<br>' );
    ad.type = type[0].replace( /[\n\s]+$/g, '' );
    ad.build = type[1].replace( /[\n\s]+$/g, '' );
    ad.price = type[2];

    return ad;
  },
  ads = []; 

scrapy( {
    url: "http://www.homegate.ch/rent/apartment-and-house/region-zuerich/matching-list?a=default&tab=list&l=default&cid=1585974&ao=&am=Z%C3%BCrich&ep=1&ac=1.5&ad=2.0&incsubs=default&tid=1&fromItem=ctn-zh&ag=1000&ah=2000&be=",
    getNextUrl: function( $ ) {
        return $('a.forward.iconLink').attr( 'href' );
      },
    filterPage: function( $ ) {
        $('#objectList tr')
          .each( function() {
            var $td = $( this ).find( '.tdTitle' ),
              ad;
        
            if ( $td.size() === 1 ) {
              ad = getAdFromTD( $td ); 
              ads.push( ad );
              console.log( ad );
            }
          });
        console.log( 'Found ' + ads.length + ' ads so far' );
      },
    done: function() {
        console.log( 'Done! found ' + ads.length + ' ads in total!');
      } 
    } );
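Assuming the module above is saved as scrapy.js next to this script, the whole thing runs with a plain node command, e.g. node example.js (the file name is just for illustration), and prints every ad as it is scraped, followed by the total.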