如何解决从大型结构化文本文件中提取信息
很好。下面是一些建议,如果您觉得有帮助,请告诉我:
import re
import pprint
import sys
class Despacho(object):
    """Parse the lines of one record and keep the captured fields.

    Every key of ``regexp`` is a tuple of attribute names and every value is
    a compiled pattern; the n-th capture group of a pattern is stored on the
    instance under the n-th attribute name of its key.
    """

    # A dict keyed by attribute names replaces one handler method per field.
    regexp = {
        ('processo',
         'data',
         'despacho'): re.compile(r'No.([\d]{9}) ([\d]{2}/[\d]{2}/[\d]{4}) (.*)'),
        ('titular',): re.compile(r'Tit.(.*)'),
        ('procurador',): re.compile(r'Procurador: (.*)'),
        ('documento',): re.compile(r'C.N.P.J./C.I.C./N INPI :(.*)'),
        ('apresentacao',
         'natureza'): re.compile(r'Apres.: (.*) ; Nat.: (.*)'),
        # NOTE(review): label assumed to be capitalised like every other
        # field label in this table; the lowercase form never matched it.
        ('marca',): re.compile(r'Marca: (.*)'),
        ('classe',): re.compile(r'Clas.Prod/Serv: (.*)'),
        ('complemento',): re.compile(r'\*(.*)'),
    }

    def __init__(self):
        """Create an empty record.

        'complemento' is the only field that can occur several times in a
        single record, so it is collected in a list.
        """
        self.complemento = []

    def read(self, line):
        """Match *line* against every pattern and store what it captures.

        A line that matches no pattern leaves the record unchanged.
        """
        # items() is available on both Python 2 and Python 3 dicts, unlike
        # the Python 2-only iteritems().
        for attrs, pattern in self.regexp.items():
            m = pattern.match(line)
            if not m:
                continue
            # Each pattern has exactly one capture group per attribute name.
            for attr, value in zip(attrs, m.groups()):
                if attr == 'complemento':
                    # Special case: accumulate instead of overwriting.
                    self.complemento.append(value)
                else:
                    setattr(self, attr, value)

    def __repr__(self):
        """Return a pretty-printed dict of every known field.

        Fields that were never captured are shown as ``None``.
        """
        d = {}
        for attrs in self.regexp:
            for attr in attrs:
                d[attr] = getattr(self, attr, None)
        return pprint.pformat(d)
def process(rpi):
    """Group the lines of *rpi* into records and yield one Despacho each.

    A record starts at a line beginning with 'No.' and ends at the next
    blank line, at the next 'No.' line, or at the end of the input.
    Lines that fall outside any record are ignored.

    rpi: any iterable of text lines, e.g. an open file.
    """
    current = None  # record being filled; None while between records
    for line in rpi:
        if line.startswith('No.'):
            if current is not None:
                # A new header without a blank separator: hand over the
                # previous record instead of silently dropping it.
                yield current
            current = Despacho()
        elif current is None:
            # Nothing to attach this line to (leading junk or extra blank
            # lines); reading it would hit an unbound record.
            continue
        elif not line.strip():
            # Empty line - end of block.
            yield current
            current = None
            continue
        current.read(line)
    if current is not None:
        # Input ended without a trailing blank line.
        yield current
def main():
    """Parse 'rm1972.txt' and print every record followed by a separator.

    Returns 0 once the whole file has been processed.
    """
    # The context manager closes the file even if parsing fails midway.
    with open('rm1972.txt') as arquivo:  # file to process
        for desp in process(arquivo):
            # The function-call form of print is valid on Python 2 and 3.
            print(desp)
            print('-' * 20)
    return 0


if __name__ == '__main__':
    main()
解决方法
我需要读取一些大文件(5万到10万行),这些文件按空行分隔成若干组。每组都以相同的格式“No.999999999 dd/mm/yyyy ZZZ”开头。下面是一些示例数据:
No.813829461 16/09/1987 270
Tit.SUZANO PAPEL E CELULOSE SA (BR/BA)
C.N.P.J./C.I.C./N INPI :16404287000155
Procurador: MARCELLO DO NASCIMENTO

No.815326777 28/12/1989 351
Tit.SIGLA SISTEMA GLOBO DE GRAVACOES AUDIO VISUAIS LTDA (BR/RJ)
C.N.P.J./C.I.C./N INPI :34162651000108
Apres.: 主格 ; Nat.: 产品
Marca: TRIO TROPICAL
Clas.Prod/Serv: 09.40
*根据2006年6月1日第123号决议的规定,2006年1月24日在RPI 1829中发布。
Procurador: WALDEMAR RODRIGUES PEDRA

No.900148764 01/11/2007 LD3
Tit.TIARA BOLSAS E CALÇADOS LTDA
Procurador: Marcia Ferreira Gomes
办公室:Marcas Marcantes e CN Ltda
根据157条的规定,没有形式要求令人满意,商标注册请求不存在的LPI
*符合正式要求的协议:810080140197
我写了一些相应地解析它的代码。有什么我需要改进的地方,以提高可读性或性能?这是我到目前为止的内容:
import re,pprint
class Despacho(object):
    """Line-by-line parser for one record.

    Each line is tried against every pattern in ``regexp``; a matching
    pattern hands its match object to the handler that stores the captured
    text on the instance.
    """

    # Maps each compiled line pattern to a getter that, given an instance,
    # returns the bound handler method for that pattern.
    regexp = {
        re.compile(r'No.([\d]{9}) ([\d]{2}/[\d]{2}/[\d]{4}) (.*)'):
            lambda self: self._processo,
        re.compile(r'Tit.(.*)'):
            lambda self: self._titular,
        re.compile(r'Procurador: (.*)'):
            lambda self: self._procurador,
        re.compile(r'C.N.P.J./C.I.C./N INPI :(.*)'):
            lambda self: self._documento,
        re.compile(r'Apres.: (.*) ; Nat.: (.*)'):
            lambda self: self._apresentacao,
        re.compile(r'Marca: (.*)'):
            lambda self: self._marca,
        re.compile(r'Clas.Prod/Serv: (.*)'):
            lambda self: self._classe,
        re.compile(r'\*(.*)'):
            lambda self: self._complemento,
    }

    def __init__(self):
        """Create an empty record.

        'complemento' may appear several times in one record, so it is the
        only field kept as a list.
        """
        self.complemento = []

    def _processo(self, matches):
        """Store the three header captures: processo, data and despacho."""
        captured = matches.groups()
        self.processo = captured[0]
        self.data = captured[1]
        self.despacho = captured[2]

    def _titular(self, matches):
        """Store the captured titular text."""
        self.titular = matches.group(1)

    def _procurador(self, matches):
        """Store the captured procurador text."""
        self.procurador = matches.group(1)

    def _documento(self, matches):
        """Store the captured documento text."""
        self.documento = matches.group(1)

    def _apresentacao(self, matches):
        """Store the two captures: apresentacao and natureza."""
        captured = matches.groups()
        self.apresentacao = captured[0]
        self.natureza = captured[1]

    def _marca(self, matches):
        """Store the captured marca text."""
        self.marca = matches.group(1)

    def _classe(self, matches):
        """Store the captured classe text."""
        self.classe = matches.group(1)

    def _complemento(self, matches):
        """Append the captured text to the complemento list."""
        self.complemento.append(matches.group(1))

    def read(self, line):
        """Run every pattern against *line* and dispatch each match."""
        for pattern, pick_handler in Despacho.regexp.items():
            found = pattern.match(line)
            if found is None:
                continue
            handler = pick_handler(self)
            handler(found)
def process(rpi):
    """Group the lines of *rpi* into records and yield one Despacho each.

    A record starts at a line beginning with 'No.' and ends at the next
    blank line, at the next 'No.' line, or at the end of the input.
    Lines that fall outside any record are ignored.

    rpi: any iterable of text lines, e.g. an open file.
    """
    current = None  # record being filled; None while between records
    # The iterable is consumed directly; wrapping it in a generator
    # expression added nothing.
    for line in rpi:
        if line.startswith('No.'):
            if current is not None:
                # A new header without a blank separator: hand over the
                # previous record instead of silently dropping it.
                yield current
            current = Despacho()
        elif current is None:
            # Nothing to attach this line to (leading junk or extra blank
            # lines); reading it would hit an unbound record.
            continue
        elif not line.strip():
            # Empty line - end of block.
            yield current
            current = None
            continue
        current.read(line)
    if current is not None:
        # Input ended without a trailing blank line.
        yield current
# Parse the input file and dump the attributes of every record, one
# separator line after each. The context manager closes the file even if
# parsing fails midway.
with open('rm1972.txt') as arquivo:  # file to process
    for desp in process(arquivo):
        pprint.pprint(desp.__dict__)
        print('--------------')
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。