A web spider is a long-running program that finds websites and record them. After that we can search the records generated by web spiders to find information we want. It’s the basic unit of a search engine.
First, we need to search for links from a base webpage and record the webpage to the database. Then, follow the links found and repeat the first step to discover more webpages.
Here are some modules that helps us coding a web spider:
- request (helps fetching webpages)
- cheerio (helps parsing webpages)
- mysql (for indexing pages)
- async (helper library)
sql.sql
-- phpMyAdmin SQL Dump -- version 3.5.8.1 -- http://www.phpmyadmin.net -- -- Host: 127.0.0.1 -- Generation Time: Jun 14, 2013 at 08:39 AM -- Server version: 5.6.11-log -- PHP Version: 5.4.14 SET SQL_MODE="NO_AUTO_VALUE_ON_ZERO"; SET time_zone = "+00:00"; /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; /*!40101 SET NAMES utf8 */; -- -- Database: `crawler` -- -- -------------------------------------------------------- -- -- Table structure for table `queue` -- CREATE TABLE IF NOT EXISTS `queue` ( `id` varchar(75) NOT NULL, `url` varchar(2000) NOT NULL, `from` varchar(2000) NOT NULL, PRIMARY KEY (`id`) ) ENGINE=InnoDB DEFAULT CHARSET=latin1; -- -------------------------------------------------------- -- -- Table structure for table `websites` -- CREATE TABLE IF NOT EXISTS `websites` ( `id` varchar(75) CHARACTER SET utf8 COLLATE utf8_unicode_ci NOT NULL, `url` varchar(2000) CHARACTER SET utf8 COLLATE utf8_unicode_ci NOT NULL, `from` varchar(2000) CHARACTER SET utf16 COLLATE utf16_unicode_ci NOT NULL, `title` varchar(500) CHARACTER SET utf8 COLLATE utf8_unicode_ci NOT NULL, `keywords` varchar(500) CHARACTER SET utf8 COLLATE utf8_unicode_ci NOT NULL, `desc` text CHARACTER SET utf32 COLLATE utf32_unicode_ci NOT NULL, PRIMARY KEY (`id`) ) ENGINE=InnoDB DEFAULT CHARSET=latin1; /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;config.js
{ "db":{ "host":"127.0.0.1", "port":3306, "user":"root", "password":"", "database":"crawler" }, "baseSite":"http://s3131212.com/links/" }
configLoader.js
var fs = require('fs'); var ConfigLoader = { _cache:null, load:function(file){ var self = this; this._cache = JSON.parse(fs.readFileSync(file).toString()); fs.watch(file,function(){ self._cache = JSON.parse(fs.readFileSync(file).toString()); }); return this._cache; }, get:function(key){ return this._cache[key]; } }; module.exports = ConfigLoader;
crawler.js
var request = require('request'); var cheerio = require('cheerio'); var async = require('async'); var db = require('mysql'); var util = require('./utils.js'); var config = require('./configLoader.js'); config.load(__dirname+'/config.json'); var Crawler = function(){ var self = this; this.conn = db.createConnection(config.get('db')); this.indexed = 0; this.baseSite = config.get('baseSite'); this._url = this.baseSite; this.url = this.baseSite; this.crawl = function(cb){ this.conn.query('SELECT * FROM `queue` LIMIT 0,1',function(e,result){ self.url = result.length > 0 ? result[0].url : self.baseSite; request(self.url,function(e,res,body){ if(result.length > 0){ self.conn.query('DELETE FROM `queue` WHERE `id` = ?',[result[0].id],function(){ cb(); }); } else { cb(); } if(!e && res.statusCode === 200){ self.getInfo(body,result.length > 0 ? result[0].from : ''); } else { console.log('Error requesting page %s',self.url); } self._url = self.url; }); }); }; this.getInfo = function(html,from){ var $ = cheerio.load(html); var title = $('head title').text(); var keywords = $('head meta[name=keywords]').attr('content'); var desc = $('head meta[name=description]').attr('content'); var links = $('a'); console.log('Crawling "%s" | %s',title,this.url); async.map(links.map(function(){ var href = $(this).attr('href'); if(href && href != self._url && !(/^#(\w)+/.test(href)) && !util.imageRegexp.test(href)){ if(util.isExternal(href)){ return 'INSERT INTO `queue` SET `id` = \''+util.id()+'\', `url` = '+self.conn.escape(href)+', `from` = '+self.conn.escape(from); } else { return 'INSERT INTO `queue` SET `id` = \''+util.id()+'\', `url` = '+self.conn.escape(util.resolveRelativeURL(href,self._url))+', `from` = '+self.conn.escape(from); } } return false; }).filter(function(el){ return !!el; }) ,this.conn.query.bind(this.conn),function(e,result){ if(e){ console.log('Error writing queue.'); console.log(e); } }); this.conn.query('INSERT INTO `websites` SET ?',{ id:util.id(), url:this.url, from:from, title:title, keywords:keywords || '', desc:desc || '' },function(e){ if(e){ console.log('Error indexing page %s',self.url); console.log(e); } else { console.log('Successfully indexed page %s',self.url); self.indexed++; } }); }; }; module.exports = Crawler;
main.js
var Crawler = require('./crawler.js'); var async = require('async'); var spider = new Crawler(); async.forever(function(cb){ spider.crawl(function(){ process.nextTick(function(){ cb(null); }); }); });utils.js
var path = require('path'); var Utils = { id:function(){ var ret = ''; for(var chars="0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_".split(''),i=0;i<75;i++){ ret += chars[~~(Math.random() * chars.length)]; } return ret; }, resolveRelativeURL:function(p,url){ var proto = url.match(/^https?/)[0]; //extract the protocol out url = url.replace(/^https?:\/\//,''); //remove the protocol return proto+'://'+path.normalize(url+'/'+p) //find out the absolute URL .replace(path.sep,'/') //replace blackslash with forward slash in Windows .replace(/#(\w)+/,''); //remove URL fragment }, isExternal:function(url){ return url.match(/^https?/) !== null; }, imageRegexp: new RegExp("("+['\\.png','\\.jpg','\\.gif','\\.bmp','\\.psd'].join('|')+")$","i") }; module.exports = Utils;Here’s how it works:
First, it crawls the base page specified in config.json first. Then it tries to find all links from the page content and add them to the queue, then the first item on the queue will be executed. This process continues again and again…
Also, we store the queue in a MySQL database for scalability. This script can be spawned across multiple machines.
Source URL: https://licson.net/post/create-a-simple-web-spider-in-node-js/
0 comments:
Post a Comment