基本爬虫代码(其中调用的 filterCapters 和 printCourseInfo 在下文"爬取慕课网的章节"一节中定义)

const http = require('http');
const url = 'http://www.imooc.com/learn/348';
// Download the course page at `url`, then parse it and print the outline.
http.get(url, (res) => {
  // A non-200 reply (redirect, error page) is not the course page, so
  // report it instead of parsing it.
  if (res.statusCode !== 200) {
    res.resume(); // discard the body so the connection can be released
    console.log('wrong', 'unexpected status code ' + res.statusCode);
    return;
  }

  // Decode at the stream level. Appending raw Buffer chunks to a string
  // decodes each chunk on its own, which corrupts any multi-byte UTF-8
  // character that happens to be split across two chunks.
  res.setEncoding('utf8');

  let html = '';
  res.on('data', (chunk) => {
    html += chunk;
  });
  res.on('end', () => {
    const courseData = filterCapters(html);
    printCourseInfo(courseData);
  });
}).on('error', (err) => {
  // Keep the original 'wrong' marker, but include the cause of the failure.
  console.log('wrong', err.message);
});

爬取慕课网的章节

const http = require('http');
const cheerio = require('cheerio');
const url = 'http://www.imooc.com/learn/348';

/**
 * Extract the chapter/video outline from the markup of a course page.
 *
 * Every element matching `.chapter` becomes one entry; its title is the text
 * of the `strong` elements inside it, and its videos are the `li` children of
 * the `.video` elements inside it.
 *
 * @param {string} html - Markup of the course page.
 * @returns {Array<{chapterTitle: string, videos: Array<{title: string, id: (string|undefined)}>}>}
 *   Chapters in document order. A video's `id` is the part of its link's
 *   `href` after `video/`, or `undefined` when that cannot be determined.
 */
function filterCapters(html) {
  const $ = cheerio.load(html);
  const courseData = [];

  $('.chapter').each((chapterIndex, chapterElement) => {
    const chapter = $(chapterElement);
    const chapterData = {
      chapterTitle: chapter.find('strong').text(),
      videos: [],
    };

    chapter
      .find('.video')
      .children('li')
      .each((videoIndex, videoElement) => {
        const link = $(videoElement).find('.J-media-item');
        // attr() returns undefined when the <li> has no matching link or the
        // link has no href; calling split() on that would throw and abort
        // the whole parse, so fall back to an undefined id instead.
        const href = link.attr('href');
        chapterData.videos.push({
          title: link.text(),
          id: typeof href === 'string' ? href.split('video/')[1] : undefined,
        });
      });

    courseData.push(chapterData);
  });

  return courseData;
}

/**
 * Write the course outline to the console: each chapter title on its own
 * line, followed by one `[id]title` line per video in that chapter.
 *
 * @param {Array<{chapterTitle: string, videos: Array<{id: string, title: string}>}>} courseData
 *   Chapters to print, in order.
 * @returns {void}
 */
function printCourseInfo(courseData) {
  // Prints a single video line.
  const printVideo = ({ id, title }) => {
    console.log(' [' + id + ']' + title + '\n');
  };

  courseData.forEach(({ chapterTitle, videos }) => {
    console.log(chapterTitle + '\n');
    videos.forEach((entry) => printVideo(entry));
  });
}

// Download the course page at `url`, then parse it and print the outline.
http.get(url, (res) => {
  // A non-200 reply (redirect, error page) is not the course page, so
  // report it instead of parsing it.
  if (res.statusCode !== 200) {
    res.resume(); // discard the body so the connection can be released
    console.log('wrong', 'unexpected status code ' + res.statusCode);
    return;
  }

  // Decode at the stream level. Appending raw Buffer chunks to a string
  // decodes each chunk on its own, which corrupts any multi-byte UTF-8
  // character that happens to be split across two chunks.
  res.setEncoding('utf8');

  let html = '';
  res.on('data', (chunk) => {
    html += chunk;
  });
  res.on('end', () => {
    const courseData = filterCapters(html);
    printCourseInfo(courseData);
  });
}).on('error', (err) => {
  // Keep the original 'wrong' marker, but include the cause of the failure.
  console.log('wrong', err.message);
});