-
Notifications
You must be signed in to change notification settings - Fork 0
/
fetchHtml.js
76 lines (68 loc) · 1.64 KB
/
fetchHtml.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
var Rx = require('rx');
var iconv = require('iconv-lite');
var rxRequest = require('./rxRequest.js');
var cheerio = require('cheerio');
var _ = require('lodash');
/**
 * Build an observable that downloads every page of one paginated listing and
 * emits each downloaded page as a parsed cheerio document.
 *
 * @param {Object} options
 * @param {string} options.urlTpl - URL template; its first `{page}`
 *   placeholder is replaced with the page number (counting from 1).
 * @param {number} [options.pageSize=5] - how many pages to request. A falsy
 *   value (including 0) falls back to 5.
 * @param {number} [options.timeout=10000] - forwarded to rxRequest as
 *   `timeout`. A falsy value falls back to 10000.
 * @param {boolean} [options.gbk] - when truthy the body is decoded as GBK via
 *   iconv-lite; otherwise `body.toString()` is used.
 * @returns {Rx.Observable} emits one `{ url, $ }` object per page, where
 *   `url` is copied from `res.url` and `$` is the cheerio handle for the page.
 */
function fetchContent(options) {
  var source = Rx.Observable
    .range(1, options.pageSize || 5)
    .flatMap(function(pageCount) {
      // Retry each page on its own. Retrying the whole flatMap'd stream would
      // restart the range on any single failure and re-emit pages that had
      // already been delivered downstream (duplicates).
      // `defer` guarantees that every attempt issues a fresh request even if
      // rxRequest hands back an observable that caches its outcome.
      return Rx.Observable.defer(function() {
        return rxRequest({
          url: options.urlTpl.replace('{page}', pageCount),
          timeout: options.timeout || 10000,
          // NOTE(review): presumably makes rxRequest yield the raw body
          // (Buffer) so it can be decoded below - confirm against rxRequest.
          encoding: null,
        });
      }).retry(3);
    })
    .map(function(res) {
      var body = res.body;
      var html;
      if (options.gbk) {
        // iconv.decode already returns a string; no extra toString() needed.
        html = iconv.decode(body, 'gbk');
      } else {
        html = body.toString();
      }
      return {
        url: res.url,
        $: cheerio.load(html, {
          decodeEntities: false
        })
      };
    });
  return source;
}
// options = {
//   urlTpl: 'keyword={keyword}&pageno={page}',
//   handleContent: function() {},
//   src: [
//     {
//       keyword: 'html5',
//       pageSize: 5,
//       gbk: false,
//       timeout: 5000
//     },
//   ]
// }
module.exports = function(options) {
var urlTpl = options.urlTpl;
var src = options.src;
var handleContent = options.handleContent;
var _src = _.chain(src)
.map(function(item) {
return _.assign({}, item, {
urlTpl: urlTpl.replace('{keyword}', item.keyword)
});
})
.map(function(options) {
return fetchContent(options);
})
.value();
var ret = Rx.Observable.merge.apply(null, _src)
.map(function(res) {
return handleContent(res);
})
.reduce(function(preVal, curItem) {
return preVal.concat(curItem);
}, []);
return ret;
};