For example, to scrape Stack Overflow (stackoverflow.com):
// Browser-like HTTP headers attached to the page requests made below.
// No "accept-encoding" entry is sent: a "gzip,deflate" value used to be listed
// here but was disabled — presumably so response bodies arrive uncompressed
// (TODO confirm before re-enabling).
var headers = {
  "accept-charset": "ISO-8859-1,utf-8;q=0.7,*;q=0.3",
  "accept-language": "en-US,en;q=0.8",
  accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.13+ (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2"
};
// Fetch the page described by `options`; after its results have been handled,
// pause five seconds and continue crawling with `nextpage`.
// NOTE(review): `options`, `crawl` and `nextpage` are not defined in this
// excerpt — they are expected to come from the surrounding code.
request(options, function onPageFetched(fetchError, pageResponse, pageBody) {
  if (fetchError) {
    // A failed request is only reported; no further page is scheduled.
    console.log(fetchError);
    return;
  }

  parseSearchResult(pageBody, function scheduleNextPage() {
    setTimeout(function resumeCrawl() {
      crawl(nextpage);
    }, 5000);
  });
});
/**
 * Download one page and print the text of every <code> element it contains.
 *
 * @param {string} url1 - URL of the page to fetch.
 * @param {function(Error=)} next - Invoked exactly once: with no argument after
 *   the page has been processed, or with the request error if the download
 *   failed.
 */
function downloadQuestion(url1, next) {
  console.error('===download=== ', url1);
  var options = {
    url: url1,
    headers: headers
  };
  request(options, function(error, response, body) {
    if (error) {
      console.log(error);
      // Hand control back even on failure; otherwise the caller waits forever
      // for a callback that never comes and the whole crawl silently stops.
      next(error);
      return;
    }
    // Keep the parsed document local: assigning an undeclared `$` throws in
    // strict mode and would overwrite any other code's `$`.
    var $ = cheerio.load(body);
    $('code').each(function(i, element) {
      console.log($(element).text());
    });
    next();
  });
}
/**
 * Visit the result links of a search page one at a time: print each link's
 * href, download the linked page, then pause before moving to the next link.
 *
 * @param {string} content - HTML of the search-results page.
 * @param {function()} next - Invoked once after every link has been handled
 *   (or the series ended with an error).
 */
function parseSearchResult(content, next) {
  // Local on purpose: other pages are parsed while this series is still
  // running, and a shared global `$` would be swapped out mid-iteration
  // (an undeclared `$` also throws in strict mode).
  var $ = cheerio.load(content);
  async.eachSeries($('.result-link a'), function(element, cb) {
    var href = $(element).attr('href');
    console.log(href);
    downloadQuestion(urlPrefix + href, function() {
      console.error('waiting...');
      // Randomised pause (wait_time plus up to 3000 ms) before the next link.
      setTimeout(function() {
        cb();
      }, wait_time + Math.random() * 3000);
    });
  }, function(err) {
    // Report only real failures; a clean run ends with a null/undefined err
    // that should not be printed alongside the scraped output.
    if (err) {
      console.log(err);
    }
    next();
  });
}