Scraping webpages using zombie.js

The following example shows how you can scrape content from a webpage using Node.js and the Zombie.js module (an insanely fast, lightweight framework for testing client-side JavaScript code in a simulated environment).

We’re going to use Zombie.js to scrape the SenchaCon 2013 sessions grid page, extract all the session and speaker information, and store it locally in a .json file.

First, create a new directory and create an app.js file for our new Node.js application.

Next, paste the following code into our app.js file:

var fs = require("fs");

// 3rd party modules.
var Browser = require("zombie");

// Scrape the session grid page and persist the result as pretty-printed JSON.
getGridData("http://senchacon.com/session-schedule.php", function (data) {
    // Wrap the `data` array in an object so the file has a named top-level key.
    // A separate variable is used instead of reassigning the callback parameter.
    var payload = {"sessions": data};
    // Let's make the JSON data pretty (two-space indentation).
    var jsonStr = JSON.stringify(payload, null, "  ");
    // Write the JSON data to the file system. fs.writeFile is asynchronous and
    // reports failures only through its completion callback, so one must be
    // supplied; otherwise a failed write would go unnoticed.
    fs.writeFile("schedule.json", jsonStr, function (err) {
        if (err) {
            console.error("Unable to write schedule.json:", err);
            // Signal the failure to the calling shell without killing the
            // process abruptly.
            process.exitCode = 1;
        }
    });
});


/**
 * Loads the session grid page and extracts one record per session link.
 *
 * Every `a.fancybox` anchor on the page is expected to point (via its URL
 * fragment) at an element that holds the session details as a series of
 * `p` tags. Anchors without a fragment, or whose target element cannot be
 * found, are skipped because no session details can be resolved for them.
 *
 * @param {String} url The URL of the SenchaCon 2013 session schedule page.
 * @param {Function} callback The function to call when the data is loaded and parsed.
 * @param {Object[]} callback.data The sessions array. Each item has the string
 *     properties `id`, `title`, `description`, `speaker`, and `location`.
 * @param {Error} callback.error The error reported while loading the page, or
 *     `null` when there was none. Callers that only declare `data` are unaffected.
 */
function getGridData(url, callback) {
    Browser.visit(url, function (e, browser) {
        if (e) {
            // Do not swallow load errors silently. The scrape still continues
            // when a browser is available, as it did before this check existed.
            console.error("Problem while loading " + url + ":", e);
        }

        if (!browser) {
            // Nothing to scrape; report an empty result together with the cause.
            callback([], e || new Error("No browser available for " + url));
            return;
        }

        // Extract the text from the specified DOM object and strip out the
        // specified prefix (matched case-insensitively at the start only).
        var extractNode = function (obj, label) {
            // Escape the label so it is always matched literally.
            var escaped = label.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
            var re = new RegExp("^" + escaped, "i");
            return browser.text(obj).replace(re, "").trim();
        };

        var sessions = [];

        // Loop through each of the `fancybox` nodes from the DOM and get the
        // session id, title, description, speaker, and location.
        browser.queryAll("a.fancybox").forEach(function (node) {
            var fragment = String(node.href || "").split("#")[1];
            if (!fragment) {
                // No fragment means the link does not reference a session element.
                return;
            }

            var id = "#" + fragment;
            var sessionDOM = browser.query(id);
            if (!sessionDOM) {
                // Without a context element the "p" lookup below would not be
                // scoped to a session, so unrelated paragraphs could be read.
                return;
            }

            // Find all the "p" tags in the specified session DOM element.
            var pTags = browser.queryAll("p", sessionDOM);
            var count = pTags.length;
            var hasSpeaker = (count === 3 || count === 4);
            var hasLocation = (count === 4);

            sessions.push({
                "id": id,
                "title": browser.text(node),
                // The second paragraph carries the description, when present.
                "description": count > 1 ? extractNode(pTags[1], "Description:") : "",
                // Speaker and location are only read for the 3- and 4-paragraph
                // layouts; any other layout leaves them empty.
                "speaker": hasSpeaker ? extractNode(pTags[2], "Speaker:") : "",
                "location": hasLocation ? extractNode(pTags[3], "Location:") : ""
            });
        });

        callback(sessions, e || null);
    });
}

Finally, we need to install the Zombie.js module in our application directory using npm. Open up a new Terminal window (or command prompt), navigate to the same directory as the app.js file we created earlier, and type:

npm install zombie

On Mac OS X you will need to have Xcode or the OSX GCC Installer. If you’re using Windows, you’ll need Cygwin, GCC, and Python. For detailed installation instructions, see the Zombie.js GitHub project page.

Once Zombie.js is installed, you can run node app from the Terminal, and in a couple of seconds you should have a schedule.json file in the application directory that looks similar to the following output:

{
  "sessions": [
    {
      "id": "#Session_132B",
      "title": "SenchaCon 2013 Keynote",
      "description": "The main keynote kicks off the conference, bringing together the entire audience of attendees for the conference welcome, key Sencha updates, demos, and major announcements about the Sencha roadmap. You’ll hear from key Sencha executives and industry luminaries, as well as key community leaders and innovators.",
      "speaker": "",
      "location": ""
    },
    {
      "id": "#Session_232B",
      "title": "DJing with Sencha Touch",
      "description": "This session showcases how the evolving HTML5 Web Audio API can be leveraged with Sencha Touch in order to create DJ software with a native-like user experience. You will learn about the Web Audio API, its flexibility and how well it can coexist in the Sencha Touch ecosystem.",
      "speaker": "Stan Bershadskiy",
      "location": "Northern Hemisphere E-2"
    },
    {
      "id": "#Session_12",
      "title": "Neptune: The Inside Story on the Latest Ext JS Theme, and How to Make the Most of It",
      "description": "The recent Ext JS 4.2 release included Neptune, a brand new, fully modern and robust application theme. In this session, the engineer and UX designer from the team behind Neptune will explain the choices they made in building the theme, some of its intricacies, and most importantly, what it means for you and your apps.",
      "speaker": "Phil Guerrant/Sun Yu",
      "location": "Americas Seminar Room"
    }
    //, ...
  ]
}

NOTE: The scraped output does not contain the track information or date/time of the presentation, only the presentation title, description, speaker name, and room name.

Leave a Reply

Your email address will not be published.