ruk·si

Node.js
Web Crawler

Updated at 2013-10-30 08:28
# Go to the directory of your project.
# Install Cheerio, a web scraping library.
npm install cheerio
// squirrel.js
var http = require("http");

/**
 * Downloads a URL over HTTP and hands the response body to a callback.
 *
 * @param {string} url - Address to fetch; passed straight to http.get.
 * @param {function(?string)} callback - Invoked exactly once, with the body
 *   as a string on success or with null if the request or the response
 *   stream reports an error.
 */
function download(url, callback) {
  var finished = false;

  // Deliver the result at most once, even if an "error" event arrives
  // after "end" (or the other way round).
  function finish(result) {
    if (finished) {
      return;
    }
    finished = true;
    callback(result);
  }

  http.get(url, function(res) {
    var data = "";
    // Let the stream decode UTF-8 itself. Without this every chunk is a
    // Buffer that gets stringified on its own, so a multi-byte character
    // split across two chunks would be corrupted.
    res.setEncoding("utf8");
    res.on("data", function(chunk) {
      data += chunk;
    });
    res.on("end", function() {
      finish(data);
    });
    res.on("error", function() {
      finish(null);
    });
  }).on("error", function() {
    finish(null);
  });
}

var cheerio = require("cheerio");

// Example 1: fetch the Daily Mail squirrel article and print the source URL
// of every matching photo, followed by "done".
var url1 = "http://www.dailymail.co.uk/news/article-2297585/Wild-squirrels-pose-charming-pictures-photographer-hides-nuts-miniature-props.html";
download(url1, function onArticlePage(html) {
  if (html) {
    var $ = cheerio.load(html);
    // Match <img class="blkBorder"> elements that are direct children of a
    // <div class="artSplitter">.
    var photos = $("div.artSplitter > img.blkBorder");
    photos.each(function logPhotoSource(index, img) {
      var source = $(img).attr("src");
      console.log(source);
    });
    console.log("done");
  } else {
    // download() handed back nothing usable (error or empty body).
    console.log("No data!");
  }
});

// Example 2: fetch the Echo JS front page and print each article as
// "poster: [title](href)".
var url2 = "http://www.echojs.com/";
download(url2, function onFrontPage(html) {
  if (html) {
    var $ = cheerio.load(html);
    $("article").each(function printStory(index, article) {
      var story = $(article);
      // The poster's name is the text of the <username> element.
      var poster = story.find("username").text();
      // The story link is the anchor directly inside the <h2> heading.
      var link = story.find("h2>a");
      console.log(poster+": ["+link.html()+"]("+link.attr("href")+")");
    });
  } else {
    // download() handed back nothing usable (error or empty body).
    console.log("No data!");
  }
});

Source