I really wanted to experiment a little bit with nodejs.
A few days ago a friend of mine told me that he had used a cool Python library to scrape the information he wanted from a site. That sounded to me like a very good exercise!
The idea is to use jQuery's powerful selectors to extract whatever I want from an HTML page. If I can do that, I can also extract the URL of the next page and then put the same mechanism to work again for that page! The code I finally had to write was about 50 lines!
var jsdom = require( 'jsdom' ),
    /**
     * Loads `conf.url` through jsdom with jQuery injected, hands the page's
     * jQuery object to the supplied callbacks, and calls itself for every
     * "next" url that `conf.getNextUrl` returns.
     *
     * @param {Object}   conf
     * @param {string}   conf.url          Passed to `jsdom.env` as `html`.
     * @param {Function} [conf.getNextUrl] Called with the page's `$`; a truthy
     *                                     return value is scraped as the next page.
     * @param {Function} [conf.filterPage] Called with the page's `$`.
     * @param {Function} [conf.done]       Called with no arguments once the
     *                                     pending count of this call reaches zero.
     * @param {number}   [counter=1]       Initial number of completions to wait
     *                                     for before `conf.done` is called.
     */
    scrapy = function( conf, counter ) {
        var url = conf.url || null,
            getNextUrl = conf.getNextUrl || null,
            filterPage = conf.filterPage || null,
            finalDone = conf.done || null,
            // Completions still outstanding for this level: one for the page
            // itself, plus one for the next page when one is started.
            pending = counter || 1,
            done = function() {
                pending--;
                if ( pending === 0 && finalDone !== null ) {
                    finalDone();
                }
            };

        jsdom.env({
            html: url,
            scripts: [ 'http://code.jquery.com/jquery-1.5.min.js' ],
            done: function( errors, window ) {
                var $,
                    nexturl,
                    childCounter;

                if ( errors ) {
                    console.log( errors );
                }

                // Without a window that carries jQuery there is nothing to hand
                // to the callbacks. Reading window.$ regardless would throw and
                // the completion callback would never fire, so finish this page.
                if ( !window || !window.$ ) {
                    console.log( 'No jQuery window available for ' + url );
                    done();
                    return;
                }
                $ = window.$;

                // First of all try to scrape the next url available
                if ( getNextUrl ) {
                    nexturl = getNextUrl( $ );
                    if ( nexturl ) {
                        // The next page starts with the count this level had
                        // before the increment (1 for a default top-level call);
                        // this level then also waits for that page to report back.
                        childCounter = pending;
                        pending++;
                        scrapy( {
                            url: nexturl,
                            getNextUrl: getNextUrl,
                            filterPage: filterPage,
                            done: done
                        }, childCounter );
                    }
                }

                if ( filterPage ) {
                    filterPage( $ );
                }
                else {
                    console.log( 'No filterPage available' );
                }

                done();
            }
        });
    };
exports.scrapy = scrapy;
The conf argument that the scrapy function expects should have a url (the page we want to filter), a filterPage
function that filters the page, and a getNextUrl function that inspects the page to find the next url that should be scraped. If you want, you can also provide a done function, which will be called only once all the scrapes have finished! Here is an example:
var scrapy = require('./scrapy.js').scrapy,
    /**
     * Builds an ad record from a row's title cell and the three sibling
     * cells that follow it, reading the inner HTML of the link in each cell.
     *
     * Text that is missing from a cell (no link, or fewer `<br>`-separated
     * parts than expected) yields an empty string instead of throwing, so a
     * single odd row cannot abort the scrape. `price` is the raw third part
     * of the last cell and is `undefined` when that part is absent.
     *
     * @param  {Object} $td jQuery object wrapping the title cell.
     * @return {Object} Record with title, address, rooms, floor, space, type,
     *                  build and price.
     */
    getAdFromTD = function( $td ) {
            // Inner HTML of the cell's link, or '' when there is none.
        var linkHtml = function( $cell ) {
                return $cell.find('a').html() || '';
            },
            // The link's HTML split on <br>.
            linkParts = function( $cell ) {
                return linkHtml( $cell ).split( '<br>' );
            },
            // The index-th part, or '' when the cell has fewer parts.
            partAt = function( parts, index ) {
                return parts[index] || '';
            },
            $addressCell = $td.next(),
            $sizeCell = $addressCell.next(),
            $kindCell = $sizeCell.next(),
            sizeParts = linkParts( $sizeCell ),
            kindParts = linkParts( $kindCell ),
            ad = {
                title: linkHtml( $td ).replace( /\s*$/g, '' )
            };

        ad.address = linkHtml( $addressCell ).replace( /<br>/g, ',' );

        // \s already covers newlines; the former [\n|\s] class also matched
        // a literal '|', which was never intended.
        ad.rooms = partAt( sizeParts, 0 ).replace( /\sRooms\s+/g, '' );
        ad.floor = partAt( sizeParts, 1 ).replace( /.\s Floor/g, '' );
        ad.space = partAt( sizeParts, 2 ).replace( /\s+$/g, '' );

        ad.type = partAt( kindParts, 0 ).replace( /\s+$/g, '' );
        ad.build = partAt( kindParts, 1 ).replace( /\s+$/g, '' );
        ad.price = kindParts[2];

        return ad;
    },
    ads = [];
// Scrape the listing page by page, collecting every ad found into `ads`.
scrapy( {
    url: "http://www.homegate.ch/rent/apartment-and-house/region-zuerich/matching-list?a=default&tab=list&l=default&cid=1585974&ao=&am=Z%C3%BCrich&ep=1&ac=1.5&ad=2.0&incsubs=default&tid=1&fromItem=ctn-zh&ag=1000&ah=2000&be=",

    // href of the element matching a.forward.iconLink; undefined when no
    // such element exists on the page.
    getNextUrl: function( $ ) {
        var $forwardLink = $('a.forward.iconLink');
        return $forwardLink.attr( 'href' );
    },

    // Turns every row of #objectList that has exactly one .tdTitle cell
    // into an ad, then reports the running total.
    filterPage: function( $ ) {
        var $rows = $('#objectList tr');

        $rows.each( function() {
            var $titleCell = $( this ).find( '.tdTitle' ),
                ad;

            // Rows without exactly one title cell are skipped.
            if ( $titleCell.size() !== 1 ) {
                return;
            }

            ad = getAdFromTD( $titleCell );
            ads.push( ad );
            console.log( ad );
        });

        console.log( 'Found ' + ads.length);
    },

    // Runs once scrapy reports that all pages have been processed.
    done: function() {
        console.log( 'Done! found ' + ads.length + ' ads in total!');
    }
} );