微信公众号搜"智元新知"关注
微信扫一扫可直接关注哦!

nodejs通过phantomjs实现下载网页

功能其实很简单:通过 phantomjs.exe 采集 url 加载的资源,再通过子进程的方式启动 nodejs 加载所有的资源;对于 css 资源,匹配 css 内容,下载里面的 url 资源

当然功能还是很简单的,在响应式设计和异步加载的情况下,还是有很多资源没有能够下载,需要根据实际情况处理下

首先当然是下载 nodejs 和 phantomjs

下面是 phantomjs.exe 执行的 down.js

var page = require('webpage').create(),system = require('system'); var spawn = require("child_process").spawn

/*
 * down.js main flow (executed by PhantomJS, not by Node):
 *   1. open the page whose address is given as the first script argument,
 *   2. record the URL of every resource the page loads,
 *   3. spawn `node --harmony downHtml.js <url,url,...>` to download them,
 *   4. exit PhantomJS with the child's exit code once the child is done.
 */
if (system.args.length === 1) {
    // Only one entry in system.args means no page address was supplied.
    console.log('Usage: down.js <some URL>');
    phantom.exit(1);
} else {
    var urls = [];
    var seenUrls = {}; // URLs already recorded, so each one is forwarded once
    page.address = system.args[1];
    page.onResourceReceived = function (res) {
        // Record a resource only on its 'start' stage and only the first time it is seen.
        if (res.stage === 'start' && !seenUrls[res.url]) {
            seenUrls[res.url] = true;
            urls.push(res.url);
        }
    };
    page.open(page.address, function (status) {
        if (status !== 'success') {
            console.log('FAIL to load the address');
            phantom.exit(1);
            return;
        }
        console.log('down resource ' + urls.length + ' urls.');
        // The URL list travels as ONE comma-joined argument; downHtml.js splits it on ','.
        var child = spawn("node", ["--harmony", "downHtml.js", urls.join(',')]);
        // Relay everything the downloader prints.
        child.stdout.on("data", function (data) {
            console.log(data);
        });
        child.stderr.on("data", function (data) {
            console.log(data);
        });
        child.on("exit", function (code) {
            // Propagate the downloader's exit status instead of always reporting success.
            phantom.exit(code || 0);
        });
    });
}

下面是对应的node运行的 downHtml.js

"use strict"; var fs = require('fs'); var http = require('http'); var path = require('path'); var r_url = require('url');

// Directories already known to exist; lets repeated calls skip the filesystem check.
var dirCache = {};

/**
 * Ensures that the directory `pathStr` exists, creating missing parent
 * directories first, then invokes `callback`.
 *
 * @param {string} pathStr - directory path to create.
 * @param {Function} callback - called with no arguments once the attempt has
 *   finished. It is also called when creation fails (best effort); such
 *   failures are logged to stderr.
 */
function makedir(pathStr, callback) {
    if (dirCache[pathStr] === 1) {
        callback();
        return;
    }
    fs.stat(pathStr, function (statErr) {
        if (!statErr) {
            // Path already exists: remember it (the original compared with `==` and never stored it).
            dirCache[pathStr] = 1;
            callback();
            return;
        }
        var createSelf = function () {
            fs.mkdir(pathStr, function (mkdirErr) {
                // EEXIST means a concurrent makedir call created it first; that counts as success.
                if (!mkdirErr || mkdirErr.code === 'EEXIST') {
                    dirCache[pathStr] = 1;
                } else {
                    console.error('makedir failed for ' + pathStr + ': ' + mkdirErr.message);
                }
                callback();
            });
        };
        var parentDir = path.dirname(pathStr);
        if (parentDir === pathStr) {
            // dirname no longer changes the path (filesystem root): stop recursing.
            createSelf();
        } else {
            makedir(parentDir, createSelf);
        }
    });
}

// Matches every `url(...)` token in a stylesheet; the argument may be quoted with ' or ".
var reg = /url\(\s*(['"]?).*?\1\s*\)/g;
// Applied to one token found by `reg`; capture group 2 is the bare URL.
var reg2 = /\(\s*(['"]?)(.*?)\1\s*\)/;
// Absolute URLs that have already been queued, so each one is fetched only once.
var isDownMap = {};

/**
 * Fetches the stylesheet at `URL`, finds every `url(...)` reference in it and
 * saves each referenced resource under `CWD/<hostname>/<pathname>`.
 *
 * Only `http:` resources are fetched, because they are requested with the
 * `http` module; references using any other scheme (`data:`, `https:`, ...)
 * are skipped.
 *
 * @param {string} URL - absolute http URL of the CSS file; relative
 *   references inside the stylesheet are resolved against it.
 */
var downImgFromCss = function (URL) {
    http.get(URL, function (res) {
        var body = "";
        res.setEncoding('utf8');
        res.on('data', function (chunk) {
            body += chunk;
        });
        res.on('end', function () {
            // String#match yields null (not an empty array) when nothing matches.
            var tokens = body.match(reg) || [];
            tokens.forEach(function (token) {
                var m = token.match(reg2);
                if (!m || !m[2]) {
                    return;
                }
                var imgurl = r_url.resolve(URL, m[2]);
                if (isDownMap[imgurl]) {
                    return;
                }
                isDownMap[imgurl] = 1;
                var uo = r_url.parse(imgurl);
                // http.get throws synchronously for non-http schemes, so skip them.
                if (uo.protocol !== 'http:' || !uo.hostname) {
                    return;
                }
                var filepath = CWD + '/' + uo.hostname + uo.pathname;
                makedir(path.dirname(filepath), function () {
                    http.get(imgurl, function (imgRes) {
                        var out = fs.createWriteStream(filepath);
                        // Without a listener a failed write would crash the whole process.
                        out.on('error', function (err) {
                            console.error('write failed for ' + filepath + ': ' + err.message);
                        });
                        imgRes.pipe(out);
                    }).on('error', function (err) {
                        console.error('download failed for ' + imgurl + ': ' + err.message);
                    });
                });
            });
        });
    }).on('error', function (err) {
        console.error('download failed for ' + URL + ': ' + err.message);
    });
};

// Resource URLs arrive as ONE comma-joined command-line argument (see down.js).
// A missing argument yields an empty list instead of a TypeError.
// NOTE: a URL that itself contains a comma is split apart here.
var URLS = (process.argv[2] || '').split(',').filter(function (item) {
    return item !== '';
});
// Download root: every resource is stored under CWD/<hostname>/<pathname>.
var CWD = process.cwd();
if (URLS.length === 0) {
    console.log('Usage: node downHtml.js <url1,url2,...>');
}
// Download resources
URLS.forEach(function (resourceUrl) {
    var uo = r_url.parse(resourceUrl);
    // Requests go through the `http` module, whose get() throws for any other scheme.
    if (uo.protocol !== 'http:' || !uo.hostname) {
        console.log('skip unsupported url:' + resourceUrl);
        return;
    }
    var pathname = uo.pathname || '/';
    // A path ending in '/' names a directory; store its document as index.html.
    if (pathname.charAt(pathname.length - 1) === '/') {
        pathname += 'index.html';
    }
    var filepath = CWD + '/' + uo.hostname + pathname;
    makedir(path.dirname(filepath), function () {
        http.get(resourceUrl, function (res) {
            var contentType = res.headers["content-type"] || '';
            // Stylesheets are additionally scanned for url(...) references.
            if (resourceUrl.indexOf('.css') != -1 || contentType.indexOf('text/css') != -1) {
                console.log('down images form css file:' + resourceUrl + '.');
                downImgFromCss(resourceUrl);
            }
            var out = fs.createWriteStream(filepath);
            // Without a listener a failed write would crash the whole process.
            out.on('error', function (err) {
                console.error('write failed for ' + filepath + ': ' + err.message);
            });
            res.pipe(out);
        }).on('error', function (err) {
            console.error('download failed for ' + resourceUrl + ': ' + err.message);
        });
    });
});

down.js downHtml.js 放在同一个文件夹下 通过下列 cmd 运行

D:\phantomjs-2.0.0-windows\bin\phantomjs.exe down.js <url>

以上所述就是本文的全部内容了,希望大家能够喜欢。

原文地址:https://www.jb51.cc/nodejs/54522.html

版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。

相关推荐