微信公众号搜"智元新知"关注
微信扫一扫可直接关注哦!

使用pdfreader从pdf获取信息

如何解决使用pdfreader从pdf获取信息

我已经成功使用 pdfreader 库(https://www.npmjs.com/package/pdfreader#raw-pdf-reading )从 PDF 中获取数据。如果 PDF 链接是在浏览器中在线打开的,一切正常。但现在我遇到了这样一个链接:http://www.creperiet.nu/Homepage/Download-File/f/1219907/h/45405c646fdb41f8dd40188584afdfdf/Dagens+Weekly+Men ,它会直接触发下载。我能否对该链接做些处理,使其在浏览器中“在线”打开而不是直接下载?或者,我能否调整 pdfreader 的用法,使其也能处理这种类型的 PDF?

/**
 * Fetches the PDF at `url`, extracts its text lines, logs them and returns them.
 *
 * The download is delegated to `bufferize` and the text extraction to
 * `readlines`; the extracted structure is passed through a JSON round trip
 * before it is logged and returned, so the caller receives plain JSON data.
 *
 * @param {string} url - Address of the PDF document.
 * @returns {Promise<Array>} The JSON-round-tripped result of `readlines`.
 */
module.exports.getAndUpdatePdfMenu = async function (url) {
    const pdfBytes = await bufferize(url);
    const extracted = await readlines(pdfBytes);
    // Serialise and re-parse to hand back a detached, JSON-only copy.
    const plainCopy = JSON.parse(JSON.stringify(extracted));
    console.log(plainCopy);
    return plainCopy;
}


/**
 * Downloads the resource at `url` and resolves with its complete body.
 *
 * Works for both `http:` and `https:` URLs, honours an explicit port in the
 * URL (falling back to the protocol default), and follows up to five HTTP
 * redirects, which is what "direct download" links commonly answer with.
 *
 * @param {string} url - Absolute http(s) URL of the resource to download.
 * @returns {Promise<Buffer>} The response body as a single Buffer.
 * @throws (as a rejection) when the URL is invalid or uses another protocol,
 *   when the request or response stream fails, when the redirect budget is
 *   exhausted, or when the final response status is not 2xx.
 */
async function bufferize(url) {
  const MAX_REDIRECTS = 5;

  // Issues one GET request; recurses (through the promise) on redirects.
  function download(target, redirectsLeft) {
    return new Promise(function (resolve, reject) {
      // A throw inside the executor rejects the promise, so an invalid URL
      // or an unsupported protocol surfaces as a normal rejection.
      const parsed = new URL(target);
      let transport;
      if (parsed.protocol === "https:") {
        transport = https;
      } else if (parsed.protocol === "http:") {
        transport = http;
      } else {
        throw new Error("unsupported protocol: " + parsed.protocol);
      }

      const req = transport.request(parsed, { method: "GET" }, (res) => {
        const status = res.statusCode;

        if (status >= 300 && status < 400 && res.headers.location) {
          res.resume(); // discard the redirect body so the socket is released
          if (redirectsLeft <= 0) {
            reject(new Error("too many redirects: " + url));
            return;
          }
          let next;
          try {
            // Location may be relative; resolve it against the current URL.
            next = new URL(res.headers.location, parsed).href;
          } catch (e) {
            reject(e);
            return;
          }
          resolve(download(next, redirectsLeft - 1));
          return;
        }

        if (status < 200 || status >= 300) {
          res.resume();
          reject(new Error("request failed with status " + status + ": " + parsed.href));
          return;
        }

        // Collect the chunks and join them once at the end instead of
        // re-copying the whole buffer on every "data" event.
        const chunks = [];
        res.on("data", (chunk) => {
          chunks.push(chunk);
        });
        res.on("end", () => {
          resolve(Buffer.concat(chunks));
        });
        res.on("error", reject);
      });

      // Without this rejection a failed request would leave the caller
      // awaiting forever.
      req.on("error", reject);
      req.end();
    });
  }

  return download(url, MAX_REDIRECTS);
}

/*
If the second parameter (xwidth) is set, a space ' ' is inserted whenever a
text chunk starts more than xwidth to the right of the x coordinate recorded
for its line (the x of the line's first chunk). This helps when words appear
separated on the page only because of their x coordinates, i.e. the PDF
contains no space characters between them.

Each page is a separate array element.
*/
/**
 * Extracts the text of an in-memory PDF, grouped by page and by line.
 *
 * Text items that share the same `y` coordinate on a page are concatenated
 * into one line, in the order the parser reports them.
 *
 * @param {Buffer} buffer - Raw PDF bytes, handed to `PdfReader#parseBuffer`.
 * @param {number} [xwidth] - When set, a single space is inserted before a
 *   chunk whose `x` exceeds the `x` of the line's first chunk by more than
 *   this value; otherwise chunks are joined without a separator.
 * @returns {Promise<Array<Array<Array<string>>>>} One entry per page (indexed
 *   by page number minus one); each page is a list of one-element arrays
 *   `[lineText]`.
 * @throws (as a rejection) when the parser reports an error. Non-Error
 *   values are wrapped in an Error whose `cause` property holds the original.
 */
async function readlines(buffer,xwidth) {
  return new Promise((resolve,reject) => {
    const pages = [];
    let pageIndex = 0;

    new pdfreader.PdfReader().parseBuffer(buffer,function(err,item) {
      if (err) {
        // Reject instead of only logging, otherwise the caller waits forever.
        if (err instanceof Error) {
          reject(err);
        } else {
          const wrapped = new Error("pdf reader error: " + err);
          wrapped.cause = err;
          reject(wrapped);
        }
        return;
      }

      if (!item) {
        // A call without an item marks the end of the document: strip the
        // bookkeeping coordinates and keep only the text of every line.
        resolve(pages.map((lines) => lines.map((line) => [line.text])));
        return;
      }

      if (item.page) {
        pageIndex = item.page - 1;
        pages[pageIndex] = [];
        return;
      }

      if (item.text) {
        // Tolerate text arriving before any page item was seen.
        if (!pages[pageIndex]) {
          pages[pageIndex] = [];
        }
        const lines = pages[pageIndex];
        const sameLine = lines.find((line) => line.y === item.y);
        if (sameLine) {
          const separator = xwidth && item.x - sameLine.x > xwidth ? " " : "";
          sameLine.text += separator + item.text;
        } else {
          lines.push({ text: item.text, y: item.y, x: item.x });
        }
      }
    });
  });
}

版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。