Ajax-based applications break crawlers because the content is generated in the browser. One technique to work around this is to generate the content on the server for crawlers by running the page's JavaScript server-side. Node.js + jsdom can be used for this. Sample code:
var http = require('http');
var jsdom = require('jsdom');
var request = require('request');
var xmlhttp = require('xmlhttprequest');
// Let jsdom fetch and execute the page's external scripts,
// so its Ajax calls actually run server-side
jsdom.defaultDocumentFeatures = {
    FetchExternalResources: ['script'],
    ProcessExternalResources: ['script'],
    MutationEvents: '2.0',
    QuerySelector: true
};
http.createServer(function (req, res) {
    res.writeHead(200, {'Content-Type': 'text/html'});
    downloadPage(res);
}).listen(1337, '127.0.0.1');
function downloadPage(res) {
    var url = ""; // URL of the Ajax page to render
    request({ uri: url }, function (error, response, body) {
        var doc = jsdom.jsdom(body);
        var window = doc.createWindow();
        // Give the jsdom window a working XMLHttpRequest
        // so the page's Ajax calls can execute
        window.XMLHttpRequest = xmlhttp.XMLHttpRequest;
        // Wait for Ajax completion - the loaded page calls this hook
        // once it has rendered its Ajax content
        window.completeajax = function () {
            res.end(doc.innerHTML);
        };
    });
}
Note: this assumes the dependent Node.js modules (jsdom, request, xmlhttprequest) are installed.
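For this to work, the crawled page itself must signal when its Ajax content has finished rendering, by calling the completeajax hook that the server attached to the window. A minimal sketch of what that might look like in the page's own JavaScript (fetchData and renderContent are hypothetical placeholders for the application's actual Ajax and rendering code):

// In the crawled page's own JavaScript: once the Ajax response has been
// rendered into the DOM, notify the server-side renderer if it is present.
fetchData(function (data) {      // hypothetical Ajax call
    renderContent(data);         // hypothetical DOM update
    if (typeof window.completeajax === 'function') {
        window.completeajax();
    }
});

The typeof guard makes the call a no-op in a normal browser, where completeajax is not defined. Once the server is running, the pre-rendered HTML can be fetched with e.g. curl http://127.0.0.1:1337/.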