从大型结构化文本文件中提取信息

如何解决从大型结构化文本文件中提取信息的问题

很好，下面是一些建议，如果您觉得合适，请告诉我：

import re
import pprint
import sys

class Despacho(object):
    """
    Hold the fields of one record, filled in line by line.

    Each line passed to ``read`` is tried against every pattern in
    ``regexp``; when a pattern matches, its capture groups are stored on the
    instance under the attribute names that key the pattern.
    """
    # Keys are tuples of attribute names, values are the compiled patterns.
    # The n-th name in a key receives the n-th capture group of its pattern.
    regexp = {
        ('processo',
         'data',
         'despacho'): re.compile(r'No.([\d]{9})  ([\d]{2}/[\d]{2}/[\d]{4})  (.*)'),
        ('titular',): re.compile(r'Tit.(.*)'),
        ('procurador',): re.compile(r'Procurador: (.*)'),
        ('documento',): re.compile(r'C.N.P.J./C.I.C./N INPI :(.*)'),
        ('apresentacao',
         'natureza'): re.compile(r'Apres.: (.*) ; Nat.: (.*)'),
        # Capitalised 'Marca' to agree with the pattern used by the other
        # Despacho implementation in this file; the lower-case spelling
        # never matched a line starting with 'Marca: '.
        ('marca',): re.compile(r'Marca: (.*)'),
        ('classe',): re.compile(r'Clas.Prod/Serv: (.*)'),
        ('complemento',): re.compile(r'\*(.*)'),
    }

    def __init__(self):
        """
        'complemento' is the only field that can be multiple in a single
        registry, so it starts as an empty list and is appended to.
        """
        self.complemento = []

    def read(self, line):
        """
        Match ``line`` against every known pattern and store what it captures.

        Lines that match no pattern leave the object unchanged.

        :param line: one line of text from the input file
        """
        # items() exists on both Python 2 and Python 3 dicts, whereas
        # iteritems() is Python 2 only.
        for attrs, pattern in Despacho.regexp.items():
            m = pattern.match(line)
            if not m:
                continue
            # every key has exactly as many names as its pattern has groups
            for attr, value in zip(attrs, m.groups()):
                if attr == 'complemento':
                    # special case: accumulate instead of overwrite
                    self.complemento.append(value)
                else:
                    setattr(self, attr, value)

    def __repr__(self):
        """
        Return a pretty-printed dict of every known field.

        Fields that were never read are shown as ``None``.
        """
        d = {}
        for attrs in self.regexp:
            for attr in attrs:
                d[attr] = getattr(self, attr, None)
        return pprint.pformat(d)

def process(rpi):
    """
    Group the lines of ``rpi`` into records and yield one Despacho per record.

    A record starts at a line beginning with 'No.' and ends at the next
    blank line, at the next 'No.' line, or at the end of the input. Lines
    that fall outside any record are ignored.

    :param rpi: iterable of text lines, e.g. an open file
    :returns: generator yielding a Despacho for every record found
    """
    current = None  # record being filled; None while between records

    for line in rpi:
        if line.startswith('No.'):
            if current is not None:
                # the previous record was not closed by a blank line;
                # emit it instead of silently discarding it
                yield current
            current = Despacho()
        elif not line.strip():
            # empty line - end of block
            if current is not None:
                yield current
                current = None
            continue

        # Only feed lines to an open record: there is nothing to read into
        # before the first 'No.' line, and a record that was already
        # yielded must not be modified afterwards.
        if current is not None:
            current.read(line)

    if current is not None:
        # input ended without a trailing blank line
        yield current

def main():
    """
    Parse 'rm1972.txt' and print every record followed by a separator line.

    :returns: 0, used as the process exit status
    """
    # the with-block closes the file even if parsing or printing fails
    with open('rm1972.txt') as arquivo:  # file to process
        for desp in process(arquivo):
            # single-argument print(...) behaves the same on Python 2 and 3
            print(desp)
            print('-' * 20)
    return 0


if __name__ == '__main__':
    # hand main()'s return value to the shell as the exit status
    sys.exit(main())

解决方法

我需要读取一些大文件（5万行到10万行），这些文件由以空行分隔的若干组记录构成。每组都以相同的格式“No.999999999  dd/mm/yyyy  ZZZ”开头。
下面是一些示例数据。

No.813829461
09/16/1987 270 SUZANO PAPEL E CELULOSE SA（BR / BA）
CNPJ / CIC / N INPI：16404287000155
律师：MARCELLO DO NASCIMENTO

No.815326777
12/28/1989 351 Tit.SIGLA SISTEMA GLOBO DE GRAVACOES AUDIO VISUAIS LTDA（BR /
RJ）
CNPJ / CIC / INPINº：34162651000108代表
：主格；类别：产品
品牌：TRIO TROPICAL产品/
服务：09.40
*根据2006年6月1日第123号决议的规定，2006年1月24日在RPI 1829中发布。
律师：WALDEMAR RODRIGUES PEDRA

No.900148764 2007年11月1日LD3
Tit.TIARA BOLSAS ECALÇADOSLTDA
律师：Marcia Ferreira Gomes
办公室：Marcas Marcantes e CN Ltda
根据157条的规定，没有形式要求令人满意，商标注册请求不存在的LPI
*符合正式要求的协议：810080140197

我写了一些相应地解析它的代码。有什么我需要改进的地方，以提高可读性或性能？这是我到目前为止的内容：

import re,pprint

class Despacho(object):
    """
    Container for the fields of a single record.

    Lines are handed to ``read`` one at a time; every pattern in ``regexp``
    that matches the line has its handler called with the match object, and
    the handler stores the captured text on the instance.
    """
    # compiled pattern -> function that takes the instance and returns the
    # bound handler method responsible for that pattern
    regexp = {
        re.compile(r'No.([\d]{9})  ([\d]{2}/[\d]{2}/[\d]{4})  (.*)'):
            lambda self: self._processo,
        re.compile(r'Tit.(.*)'):
            lambda self: self._titular,
        re.compile(r'Procurador: (.*)'):
            lambda self: self._procurador,
        re.compile(r'C.N.P.J./C.I.C./N INPI :(.*)'):
            lambda self: self._documento,
        re.compile(r'Apres.: (.*) ; Nat.: (.*)'):
            lambda self: self._apresentacao,
        re.compile(r'Marca: (.*)'):
            lambda self: self._marca,
        re.compile(r'Clas.Prod/Serv: (.*)'):
            lambda self: self._classe,
        re.compile(r'\*(.*)'):
            lambda self: self._complemento,
    }

    def __init__(self):
        """
        Start with an empty 'complemento' list: it is the one field that
        may occur several times within a single record.
        """
        self.complemento = []

    def _processo(self, found):
        # header line: three captures, stored in order
        self.processo, self.data, self.despacho = found.group(1, 2, 3)

    def _titular(self, found):
        self.titular = found.group(1)

    def _procurador(self, found):
        self.procurador = found.group(1)

    def _documento(self, found):
        self.documento = found.group(1)

    def _apresentacao(self, found):
        # two captures on the same line
        self.apresentacao, self.natureza = found.group(1, 2)

    def _marca(self, found):
        self.marca = found.group(1)

    def _classe(self, found):
        self.classe = found.group(1)

    def _complemento(self, found):
        # accumulate rather than overwrite
        self.complemento.append(found.group(1))

    def read(self, line):
        """
        Try every pattern against ``line`` and run the handler of each match.

        :param line: one line of text from the input file
        """
        for compiled, handler_for in Despacho.regexp.items():
            hit = compiled.match(line)
            if hit is None:
                continue
            handler_for(self)(hit)


def process(rpi):
    """
    Split the lines of ``rpi`` into records and yield a Despacho for each.

    A record begins at a line starting with 'No.' and is closed by the next
    blank line, by the next 'No.' line, or by the end of the input. Lines
    found outside any record are skipped.

    :param rpi: iterable of text lines, e.g. an open file
    :returns: generator yielding one Despacho per record
    """
    # rpi is iterated directly; wrapping it in another generator added
    # nothing, since the for loop already consumes it lazily.
    record = None  # the record currently being filled, if any

    for line in rpi:
        if line.startswith('No.'):
            if record is not None:
                # no blank line closed the previous record: emit it now
                # rather than dropping it
                yield record
            record = Despacho()
        elif not line.strip():
            # empty line - end of block
            if record is not None:
                yield record
                record = None
            continue

        # Guarded so that lines ahead of the first 'No.' header do not
        # raise, and an already-yielded record is never modified.
        if record is not None:
            record.read(line)

    if record is not None:
        # the input did not end with a blank line
        yield record


# File to process. The with-block closes it once every record has been
# printed, or as soon as an error interrupts the loop.
with open('rm1972.txt') as arquivo:
    for desp in process(arquivo):
        # dump every attribute stored on the record
        pprint.pprint(desp.__dict__)
        print('--------------')