微信公众号搜"智元新知"关注
微信扫一扫可直接关注哦!

Python Tika 错误:URLError: <urlopen error unknown url type: c>

如何解决Python Tika 错误:URLError: <urlopen error unknown url type: c>

我经常使用 Python Tika 从一些 PDF 文件中提取文本。突然之间,Tika 无法再运行以下代码以及类似的代码:

from tika import parser
document = parser.from_file("prova.pdf")['content'] 

from tika import parser
parser.from_file("C:/Users/Daniele/Desktop/progetto_tesi_magistrale/prova.pdf")['content']

每次运行我都会收到如下错误:

2021-02-23 10:57:36,244 [MainThread  ] [INFO ]  Retrieving C:\Program Files\tika-server-1.24.1.jar to C:\Users\Daniele\AppData\Local\Temp\tika-server.jar.
---------------------------------------------------------------------------
URLError                                  Traceback (most recent call last)
~\anaconda3\lib\site-packages\tika\tika.py in getRemoteJar(urlOrPath,destPath)
    797         try:
--> 798             urlretrieve(urlOrPath,destPath)
    799         except IOError:

~\anaconda3\lib\urllib\request.py in urlretrieve(url,filename,reporthook,data)
    246 
--> 247     with contextlib.closing(urlopen(url,data)) as fp:
    248         headers = fp.info()

~\anaconda3\lib\urllib\request.py in urlopen(url,data,timeout,cafile,capath,cadefault,context)
    221         opener = _opener
--> 222     return opener.open(url,timeout)
    223 

~\anaconda3\lib\urllib\request.py in open(self,fullurl,timeout)
    524         sys.audit('urllib.Request',req.full_url,req.data,req.headers,req.get_method())
--> 525         response = self._open(req,data)
    526 

~\anaconda3\lib\urllib\request.py in _open(self,req,data)
    546 
--> 547         return self._call_chain(self.handle_open,'unknown',
    548                                 'unknown_open',req)

~\anaconda3\lib\urllib\request.py in _call_chain(self,chain,kind,meth_name,*args)
    501             func = getattr(handler,meth_name)
--> 502             result = func(*args)
    503             if result is not None:

~\anaconda3\lib\urllib\request.py in unknown_open(self,req)
   1420         type = req.type
-> 1421         raise URLError('unknown url type: %s' % type)
   1422 

URLError: <urlopen error unknown url type: c>

During handling of the above exception, another exception occurred:

URLError                                  Traceback (most recent call last)
<ipython-input-4-5aa5aa48deec> in <module>
      1 from tika import parser
      2 
----> 3 document = parser.from_file("prova.pdf")['content']
      4 #import tika
      5 #from tika import parser

~\anaconda3\lib\site-packages\tika\parser.py in from_file(filename,serverEndpoint,service,xmlContent,headers,config_path,requestOptions)
     38     '''
     39     if not xmlContent:
---> 40         output = parse1(service,headers=headers,config_path=config_path,requestOptions=requestOptions)
     41     else:
     42         output = parse1(service,services={'meta': '/meta','text': '/tika','all': '/rmeta/xml'},

~\anaconda3\lib\site-packages\tika\tika.py in parse1(option,urlOrPath,verbose,tikaServerJar,responseMimeType,services,rawResponse,requestOptions)
    334     headers.update({'Accept': responseMimeType,'Content-disposition': make_content_disposition_header(path.encode('utf-8') if type(path) is unicode_string else path)})
    335     with urlOrPath if _is_file_object(urlOrPath) else open(path,'rb') as f:
--> 336         status,response = callServer('put',f,
    337                                       headers,
    338                                       rawResponse=rawResponse,requestOptions=requestOptions)

~\anaconda3\lib\site-packages\tika\tika.py in callServer(verb,httpVerbs,classpath,requestOptions)
    529     global TikaClientOnly
    530     if not TikaClientOnly:
--> 531         serverEndpoint = checkTikaServer(scheme,serverHost,port,config_path)
    532 
    533     serviceUrl  = serverEndpoint + service

~\anaconda3\lib\site-packages\tika\tika.py in checkTikaServer(scheme,config_path)
    590         if not alreadyRunning:
    591             if not os.path.isfile(jarPath) and urlp.scheme != '':
--> 592                 getRemoteJar(tikaServerJar,jarPath)
    593 
    594             if not checkJarSig(tikaServerJar,jarPath):

~\anaconda3\lib\site-packages\tika\tika.py in getRemoteJar(urlOrPath,destPath)
    806             if os.path.exists(destPath) and os.path.isfile(destPath):
    807                 os.remove(destPath)
--> 808             urlretrieve(urlOrPath,destPath)
    809 
    810         return (destPath,'remote')

~\anaconda3\lib\urllib\request.py in urlretrieve(url,data)
    245     url_type,path = _splittype(url)
    246 
--> 247     with contextlib.closing(urlopen(url,data)) as fp:
    248         headers = fp.info()
    249 

~\anaconda3\lib\urllib\request.py in urlopen(url,context)
    220     else:
    221         opener = _opener
--> 222     return opener.open(url,timeout)
    223 
    224 def install_opener(opener):

~\anaconda3\lib\urllib\request.py in open(self,timeout)
    523 
    524         sys.audit('urllib.Request',data)
    526 
    527         # post-process response

~\anaconda3\lib\urllib\request.py in _open(self,data)
    545             return result
    546 
--> 547         return self._call_chain(self.handle_open,req)
    549 

~\anaconda3\lib\urllib\request.py in _call_chain(self,*args)
    500         for handler in handlers:
    501             func = getattr(handler,meth_name)
--> 502             result = func(*args)
    503             if result is not None:
    504                 return result

~\anaconda3\lib\urllib\request.py in unknown_open(self,req)
   1419     def unknown_open(self,req):
   1420         type = req.type
-> 1421         raise URLError('unknown url type: %s' % type)
   1422 
   1423 def parse_keqv_list(l):

URLError: <urlopen error unknown url type: c>

我试过卸载 Tika Python、Tika 服务器、Java、Python……基本上所有的东西。奇怪的是,我的第二台电脑也突然出现了同样的问题。有什么建议吗?非常感谢。

版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。