Node.js - Web Crawler
Updated at 2013-10-30 08:28
# Go to the directory of your project.
# Install Cheerio, a web scraping library.
npm install cheerio
// squirrel.js
var http = require("http");
// Utility function that downloads a URL over HTTP and invokes callback
// exactly once: with the response body decoded as a UTF-8 string when the
// response ends, or with null if the request or the response stream fails.
//
// url      - the address handed to http.get().
// callback - function(data) receiving the body string, or null on failure.
function download(url, callback) {
    var finished = false;

    // Both the request and the response can report a failure, so make sure
    // the caller is only ever notified once.
    function finish(result) {
        if (finished) {
            return;
        }
        finished = true;
        callback(result);
    }

    http.get(url, function(res) {
        var data = "";
        // Decode at the stream level so that a multi-byte character split
        // across two chunks is reassembled, instead of being corrupted by
        // converting each Buffer chunk to a string on its own.
        res.setEncoding("utf8");
        res.on("data", function(chunk) {
            data += chunk;
        });
        res.on("end", function() {
            finish(data);
        });
        // A response that breaks mid-body may never emit "end"; report it
        // as a failure rather than leaving the caller waiting.
        res.on("error", function() {
            finish(null);
        });
    }).on("error", function() {
        finish(null);
    });
}
var cheerio = require("cheerio");
// Fetch the squirrel photo article and print the "src" attribute of every
// matching image, followed by a final "done" line.
var url1 = "http://www.dailymail.co.uk/news/article-2297585/Wild-squirrels-pose-charming-pictures-photographer-hides-nuts-miniature-props.html";
download(url1, function(html) {
    if (html) {
        var $ = cheerio.load(html);
        // The selector matches <img class="blkBorder"> elements that are
        // direct children of <div class="artSplitter">.
        var images = $("div.artSplitter > img.blkBorder");
        images.each(function(index, image) {
            var source = $(image).attr("src");
            console.log( source );
        });
        console.log("done");
    } else {
        // download() delivered nothing usable.
        console.log("No data!");
    }
});
// Fetch the Echo JS front page and print one Markdown-style line per
// article: "poster: [title](href)".
var url2 = "http://www.echojs.com/";
download(url2, function(page) {
    if (page) {
        var $ = cheerio.load(page);
        $("article").each(function(index, article) {
            var entry = $(article);
            // The anchor directly inside the article's <h2> supplies both
            // the title markup and the target address.
            var headline = entry.find("h2>a");
            // The text of the <username> element is used as the poster name.
            var author = entry.find("username").text();
            var title = headline.html();
            var address = headline.attr("href");
            console.log(author+": ["+title+"]("+address+")");
        });
    } else {
        // download() delivered nothing usable.
        console.log("No data!");
    }
});