如何使用 pdfreader 从 PDF 中获取信息
我已经成功使用 pdfreader 库(https://www.npmjs.com/package/pdfreader#raw-pdf-reading )从 PDF 中读取数据。如果 PDF 链接在浏览器中是在线打开的,则一切正常。但是现在我遇到了下面这个链接:http://www.creperiet.nu/Homepage/Download-File/f/1219907/h/45405c646fdb41f8dd40188584afdfdf/Dagens+Weekly+Men ,它会直接触发下载。我能否对该链接做些处理,使其在浏览器中“在线”打开而不是直接下载?或者,我能否通过 pdfreader 做一些处理,使其也能读取这种类型的 PDF?
/**
 * Download the PDF at `url`, extract its text lines, log them and return them.
 *
 * The work is delegated to two helpers in this file: `bufferize` fetches the
 * bytes and `readlines` turns them into text lines.
 *
 * @param {string} url - Address of the PDF document.
 * @returns {Promise<Array>} A JSON round-tripped copy of what `readlines` produced.
 */
module.exports.getAndUpdatePdfMenu = async function (url) {
  const pdfBytes = await bufferize(url);
  const extracted = await readlines(pdfBytes);
  // Serialising and re-parsing hands back a detached, JSON-compatible copy.
  const serialised = JSON.stringify(extracted);
  const menuLines = await JSON.parse(serialised);
  console.log(menuLines);
  return menuLines;
};
/**
 * Download the resource at `url` and return its complete body as one Buffer.
 *
 * Works for both `http:` and `https:` URLs. It does not matter whether the
 * server marks the response as an attachment ("direct download") or as an
 * inline document: the raw response body is collected either way.
 *
 * The body is returned whatever the HTTP status code is, and redirects are
 * NOT followed; callers that need either must inspect the result themselves.
 *
 * @param {string} url - Absolute http(s) URL of the resource.
 * @returns {Promise<Buffer>} Resolves with the full response body.
 *   Rejects if `url` cannot be parsed or if the request/response stream
 *   reports an error.
 */
async function bufferize(url) {
  // Let the URL parser split host, port and path instead of slicing strings;
  // this keeps an explicit ":port" out of the hostname and lets Node pick the
  // protocol's default port (80 or 443) when none is given.
  const target = new URL(url);
  const client = target.protocol === "https:" ? https : http;

  return new Promise(function (resolve, reject) {
    // Gather chunks and join once at the end rather than re-copying the
    // accumulated buffer on every "data" event.
    const chunks = [];
    const req = client.request(target, { method: "GET" }, (res) => {
      res.on("data", (chunk) => {
        chunks.push(chunk);
      });
      res.on("end", () => {
        resolve(Buffer.concat(chunks));
      });
      res.on("error", reject);
    });
    // Propagate network failures to the caller instead of leaving the
    // promise pending forever.
    req.on("error", reject);
    req.end();
  });
}
/**
 * Extract the text of a PDF buffer, grouped into pages and lines.
 *
 * Text chunks reported by pdfreader with the same `y` coordinate on the same
 * page are concatenated into one line, in the order they are reported.
 *
 * If `xwidth` is set, a space ' ' is inserted between two consecutive chunks
 * of a line whenever their `x` coordinates are more than `xwidth` apart. This
 * helps when words look separated on the page only because of their x
 * coordinates (the PDF contains no space characters between them).
 *
 * @param {Buffer} buffer - Raw bytes of the PDF document.
 * @param {number} [xwidth] - Horizontal distance above which a space is
 *   inserted between consecutive chunks; omit (or pass 0) to never insert.
 * @returns {Promise<Array<Array<Array<string>>>>} One element per page; each
 *   page is an array of lines; each line is a one-element array `[text]`.
 *   Rejects if the parser reports an error.
 */
async function readlines(buffer, xwidth) {
  return new Promise((resolve, reject) => {
    // pages[pageIndex] = [{ text, y, lastX }, ...]
    const pages = [];
    let pg = 0;
    new pdfreader.PdfReader().parseBuffer(buffer, function (err, item) {
      if (err) {
        // Surface the failure to the caller; only logging it would leave the
        // promise pending forever.
        reject(err instanceof Error ? err : new Error("pdf reader error: " + err));
        return;
      }
      if (!item) {
        // A call without an item marks the end of the document: drop the
        // bookkeeping coordinates and hand back only the text of each line.
        resolve(pages.map((rows) => rows.map((row) => [row.text])));
        return;
      }
      if (item.page) {
        pg = item.page - 1;
        pages[pg] = [];
        return;
      }
      if (!item.text) {
        return;
      }
      // Tolerate text that arrives before any page marker.
      if (!pages[pg]) {
        pages[pg] = [];
      }
      const rows = pages[pg];
      const row = rows.find((candidate) => candidate.y === item.y);
      if (!row) {
        rows.push({ text: item.text, y: item.y, lastX: item.x });
        return;
      }
      // Measure the gap from the previous chunk of this line, not from the
      // start of the line; otherwise every chunk after the first wide gap
      // would be prefixed with a space.
      const separator = xwidth && item.x - row.lastX > xwidth ? " " : "";
      row.text += separator + item.text;
      row.lastX = item.x;
    });
  });
}
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。