微信公众号搜"智元新知"关注
微信扫一扫可直接关注哦!

将pathos与XML结合使用时出现酸洗错误

如何解决将pathos与XML结合使用时出现酸洗错误

我正在尝试将the multistream Wikipedia dump读入数据库。这是我尝试并行加载较小的块。这是脚本:

#!/usr/bin/python3
import xml.sax
from bz2 import BZ2File
import mwparserfromhell
import psycopg2
import pathos
import os
import dill


class XmlHandler(xml.sax.handler.ContentHandler):
    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None
        self._values = {}
        self._current_tag = None
        self._pages = []

    def get_pages(self):
        return self._pages

    def get_page_count(self):
        return len(self._pages)

    def get_values(self):
        return self._values

    def characters(self,content):
        if self._current_tag:
            self._buffer.append(content)

    def startElement(self,name,attrs):
        if name in ('title','text','infoBox'):
            self._current_tag = name
            self._buffer = []

    def endElement(self,name):
        if name == self._current_tag:
            self._values[name] = ' '.join(self._buffer)

        if name == 'page':
            self.process_article()

    def process_article(self):
        wikicode = mwparserfromhell.parse(self._values['text'])
        infoBox_array = wikicode.filter_templates(matches="infoBox .*")
        infoBox = str(infoBox_array[0]) if len(infoBox_array) > 0 else ""
        self._pages.append((self._values['title'],self._values['text'],infoBox))


def load_xml(filename):
    wiki_handler = XmlHandler()
    wiki_parser = xml.sax.make_parser()
    wiki_parser.setContentHandler(wiki_handler)

    file = os.path.join("chunks",filename)
    print("I'm a worker process")
    cursor = conn.cursor()

    with BZ2File(file,'r') as f:
        for line in f:
            wiki_parser.Feed(line)

        pages = wiki_handler.get_pages()

    for page in pages:
        cursor.execute("INSERT INTO pages (title,text,infoBox) VALUES (%s,%s,%s) ON CONFLICT DO nothing",page)

    cursor.close()
    print("all done")


if __name__ == "__main__":
    conn = psycopg2.connect(dbname="wikipedia",user="postgres",password="postgres",host="localhost",port=5432)

    file_list = [f for f in os.listdir("chunks") if os.path.isfile(os.path.join("chunks",f))]
    pool = pathos.multiprocessing.ProcessingPool(processes=pathos.multiprocessing.cpu_count())
    pool.map(load_xml,file_list)

和追溯:

Traceback (most recent call last):
  File "./loader_parallel.py",line 114,in <module>
    pool.map(load_xml,file_list)
  File "/home/smh/.local/lib/python3.7/site-packages/multiprocess/pool.py",line 268,in map
    return self._map_async(func,iterable,mapstar,chunksize).get()
  File "/home/smh/.local/lib/python3.7/site-packages/multiprocess/pool.py",line 657,in get
    raise self._value
multiprocess.pool.MaybeEncodingError: Error sending result: 
'<multiprocess.pool.ExceptionWithTraceback object at 0x7f87ac0f4470>'. 
Reason: 'TypeError("can't pickle pyexpat.xmlparser objects")'

为什么不能腌制pyexpat.xmlparser对象?我该如何解决?我尝试通过运行dill.copy(XmlHandler())对其进行测试,并且这样做没有错误

我通过Debian 10上运行Python 3.7的pip3安装了pathos,对此还很陌生,感谢您的帮助!

版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。