如何限制 tesseract OCR 识别的字符数
我想要屏蔽的图像如下所示。我把 tesseract 当作库来使用,遍历多张图像、识别其中的数字,然后把这些数字屏蔽掉。每张图像中有 12 位数字,我希望它在第 8 位数字之后停止识别/屏蔽,也就是说只需要屏蔽 8 位数字。我尝试过通过裁剪图像来硬编码这一点,因为大多数图像的版式都比较相似,但稍有偏差的图像就会出错。有没有办法让 tesseract 在第 8 位数字之后不再屏蔽这些图像?我使用 cv2.rectangle 来屏蔽图像。
# Batch job over the JPEGs in D:/dataset/allpdf/data: auto-crop and resize each image,
# OCR a lower-left region with Tesseract, paint a filled rectangle over the first
# eight recognised characters, and save the result as a PDF.
# NOTE(review): this listing has lost its indentation (loop bodies are flush left) and
# several calls look truncated; it is not runnable as shown. Code is left untouched.
# NOTE(review): the package is normally imported as `pytesseract` (lower-case) - confirm
# the alias used by the (not shown) import lines.
PyTesseract.PyTesseract.tesseract_cmd = "D:/Tess/tesseract.exe"
for imgfilepathactual in glob.iglob('D:/dataset/allpdf/data/*.jpeg'):
# Reduce the globbed path to a bare file stem: take the 4th "/"-separated piece,
# then the part after a backslash, then the text before the first ".".
# NOTE(review): assumes the file name is joined with a Windows "\\" separator and
# contains no extra dots - TODO confirm.
imgfilepath2 = imgfilepathactual.split("/")[3]
imgfilepath1 = imgfilepath2.split('\\')[1]
imgfilepath = imgfilepath1.split(".")[0]
#print(filepath)
print(imgfilepath)
# Load the source image as a single-channel grayscale array.
img = cv2.imread('D:/dataset/allpdf/data/' + imgfilepath + '.jpeg',cv2.IMREAD_GRAYSCALE) #if using with pdf conv
#print(str(img))
# Edge detection used only to find the content bounding box.
# NOTE(review): OpenCV spells this cv2.Sobel(src, ddepth, dx, dy, ...); the two calls
# below look like they lost arguments (and capitalisation) - verify against the original.
sobelX1 = cv2.sobel(img,cv2.CV_64F,1,ksize = 1)
sobelY1 = cv2.sobel(img,ksize = 1)
sobelX1 = np.uint8(np.absolute(sobelX1))
sobelY1 = np.uint8(np.absolute(sobelY1))
# sobelCombined1 is computed here but not read again in this listing.
sobelCombined1 = cv2.bitwise_or(sobelX1,sobelY1)
blurred = cv2.blur(sobelX1,(3,3)) #for pdf->img
canny = cv2.Canny(blurred,5,250)
# Bounding box of every edge pixel: argwhere yields (row, col) pairs, so min/max
# over axis 0 give the top-left and bottom-right corners.
pts1 =np.argwhere(canny>0)
y11,x11 = pts1.min(axis=0)
y21,x21 = pts1.max(axis=0)
cropped = img[y11:y21,x11:x21]
#cv2.imwrite("cropped.png",cropped)
# Normalise every image to 1080x720 so the fractional crop boxes below line up.
resizedimage = cv2.resize(cropped,(1080,720),interpolation=cv2.INTER_CUBIC) #actual
# Round-trip through disk: written with OpenCV, then re-read (shape unpacked as 3 values).
cv2.imwrite('resizedimage' + imgfilepath + '.jpeg',resizedimage)
img = cv2.imread('resizedimage' + imgfilepath + '.jpeg')
h1,w1,_= img.shape
resizedimage = Image.open('resizedimage' + imgfilepath + '.jpeg')
# NOTE(review): PIL's Image.crop expects a 4-tuple (left, upper, right, lower); Box1 has
# only two elements here - probably truncated (w1 is otherwise unused). TODO confirm.
Box1 = (0,0.90*h1)
resizedimage = resizedimage.crop(Box1)
resizedimage.save('resizedimage' + imgfilepath + '.jpeg')
# Re-read the cropped file; this `img` is the array the mask is later drawn on.
img = cv2.imread('resizedimage' + imgfilepath + '.jpeg')
h2,w2,_ = img.shape
print ((h2,w2),"reso")
croppedimg2 = Image.open('resizedimage' + imgfilepath + '.jpeg')
# Region handed to OCR: left 48% of the width, bottom 25% of the height.
Box2 = (0,0.75*h2,0.48*w2,h2)
croppedimg2 = croppedimg2.crop(Box2)
croppedimg2.save('croppedimg2' + imgfilepath + '.jpeg')
# NOTE(review): pytesseract's function is `image_to_boxes` (lower-case b) - confirm.
aadharBoxes = PyTesseract.image_to_Boxes(croppedimg2,lang = "eng")#,config=' --psm 7 -c tessedit_char_whitelist=0123456789abcdefghijklmnopqrstuvwxyz'
#adharBoxes = aadharBoxes[0:8]
# Each output line is split on spaces into fields.
# NOTE(review): the original nesting is lost. If the loop body is only the split below,
# `b` ends up holding the fields of ONE box line, so b[i][2] / b[i][4] further down would
# not address the i-th character's box - verify the intended indexing.
for b in aadharBoxes.splitlines():
b = b.split(' ')
# Collect the two y-values (fields 2 and 4) of the first eight entries.
# NOTE(review): range(8) is hard-coded; fewer than eight entries would raise IndexError.
high = []
for i in range(8):
b[i] = b[i].split(' ')
high.append(int(b[i][2]))
high.append(int(b[i][4]))
#b = b[0:8]
print(b)
#print(len(b[0]))
# Filled rectangle (thickness -1) from field 1 of entry 0 to field 3 of entry 7,
# spanning the min..max of the collected y-values.
# NOTE(review): the colour tuple has two components, (0,0); a 3-channel image normally
# takes three - possibly truncated. Also, unlike the PDF loop later in this file, the
# y-values are not subtracted from the image height - TODO confirm which is intended.
maskedImage = cv2.rectangle(img,(int(b[0][1]),np.min(high)),(int(b[7][3]),np.max(high)),(0,0),-1)
# Persist the masked image, wrap it into a one-image PDF and write that to masked_files.
cv2.imwrite("maskedImage" + imgfilepath + ".jpeg",maskedImage)
pdf = img2pdf.convert("maskedImage"+ imgfilepath + ".jpeg")
file = open("D:/dataset/allpdf/masked_files/masked" + imgfilepath + ".pdf","wb")
file.write(pdf)
file.close()
#else:
#pdf = img2pdf.convert("unmaskedImage"+ resizedimage + ".jpeg")
#file = open("D:/dataset/allpdf/masked_files/masked" + imgfilepath + ".pdf","wb")
#file.write(pdf)
#file.close()
# Only the resized temp file is deleted; the other two removals are commented out.
os.remove('resizedimage' + imgfilepath + '.jpeg')
#os.remove('maskedImage' + imgfilepath + '.jpeg')
#os.remove('croppedimg2' + imgfilepath + '.jpeg')
# Batch job over the PDFs in D:/dataset/allpdf: rasterise each PDF to a JPEG, crop a
# lower-left region, OCR it with Tesseract, draw a rectangle over every recognised
# character, and save the result as a new PDF.
# NOTE(review): this listing has lost its indentation and several lines below are
# visibly truncated; it is not runnable as shown. Code is left untouched.
for filepathactual in glob.iglob('D:/dataset/allpdf/*.pdf'):
#print(filepathactual)
# Reduce the globbed path to a bare file stem (3rd "/"-piece, part after "\\", text
# before the first "."). NOTE(review): assumes a Windows "\\" before the file name.
filepath2 = filepathactual.split("/")[2]
filepath1 = filepath2.split("\\")[1]
filepath = filepath1.split(".")[0]
print(filepath)
# Helper that rasterises the current PDF; it reads filepathactual / filepath from the
# enclosing scope. Every page is saved under the same name, so the file on disk ends
# up holding the last page.
# NOTE(review): presumably pdf2image.convert_from_path, with 500 as the dpi - confirm.
def convertPdf2img():
pages = convert_from_path(filepathactual,500) #converting pdf to img
for page in pages:
page.save('out' + filepath + '.jpg','JPEG')
convertPdf2img()
img = cv2.imread('out' + filepath + '.jpg',cv2.IMREAD_GRAYSCALE) #if using with pdf conv
# NOTE(review): the next two lines look like several statements fused together by the
# extraction (compare the Sobel / blur / Canny / crop / resize sequence in the JPEG loop
# earlier in this file). `sobelY1` and `resizedimage` are not assigned in this loop.
sobelX1 = cv2.sobel(img,sobelY1)
blurred = cv2.blur(img,interpolation=cv2.INTER_CUBIC) #actual
cv2.imwrite('resizedimage' +filepath + '.jpeg',resizedimage)
img = cv2.imread('resizedimage' +filepath + '.jpeg')
# NOTE(review): unpacks two values from img.shape; the read above has no grayscale
# flag, so three may be expected - likely truncated.
h1,_= img.shape
resizedimage = Image.open('resizedimage' +filepath + '.jpeg')
# NOTE(review): PIL's Image.crop expects a 4-tuple; Box1 has two elements - likely truncated.
Box1 = (0,0.90*h1)
resizedimage = resizedimage.crop(Box1)
resizedimage.save('resizedimage' +filepath + '.jpeg')
# Re-read the cropped file; this `img` is the array the rectangles are drawn on.
img = cv2.imread('resizedimage' +filepath + '.jpeg')
# NOTE(review): the next line is a fragment (unbalanced parenthesis); it presumably was
# the shape unpack plus a debug print. h2 and w2 are not assigned in this loop as shown.
h2,"reso")
croppedimg2 = Image.open('resizedimage' +filepath + '.jpeg')
# Region handed to OCR: left 60% of the width, bottom 35% of the height.
Box2 = (0,0.65*h2,0.6*w2,h2)
croppedimg2 = croppedimg2.crop(Box2)
croppedimg2.save('croppedimg2' + filepath + '.jpeg')
# NOTE(review): pytesseract's function is `image_to_boxes` (lower-case b) - confirm.
aadharBoxes = PyTesseract.image_to_Boxes(croppedimg2,lang = "eng")
# One rectangle per OCR box line; fields 1-4 are used as coordinates and the two
# y-values are subtracted from h2 before being handed to OpenCV.
for b in aadharBoxes.splitlines():
b = b.split(' ')
# NOTE(review): -1 sits where cv2.rectangle expects the colour; a colour tuple before
# the thickness seems to be missing - confirm.
maskedImage = cv2.rectangle(img,(int(b[1]),h2 - int(b[2])),(int(b[3]),h2 - int(b[4])),-1)
#print(b,"coords")
# Persist the masked image, wrap it into a one-image PDF and write that to masked_files.
cv2.imwrite("maskedImage" + filepath + ".jpeg",maskedImage)
pdf = img2pdf.convert("maskedImage"+ filepath + ".jpeg")
file = open("D:/dataset/allpdf/masked_files/masked" + filepath + ".pdf","wb")
file.write(pdf)
file.close()
#print(w,h)
# Remove all four intermediate files created for this PDF.
os.remove('out' + filepath + '.jpg')
os.remove('resizedimage' + filepath + '.jpeg')
os.remove('maskedImage' + filepath + '.jpeg')
os.remove('croppedimg2' + filepath + '.jpeg')
解决方法
只取识别结果中前八个字符的边界框,并仅对它们所覆盖的区域绘制矩形(屏蔽);其余字符不受影响。
import pytesseract
import numpy as np
import cv2
# Demo: outline the region covered by the first eight characters that Tesseract
# recognises in a single-line image, leaving any later characters untouched.
img = cv2.imread('muTYX.jpg')
if img is None:
    # cv2.imread reports a missing or unreadable file by returning None.
    raise FileNotFoundError("could not read image 'muTYX.jpg'")
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# image_to_boxes returns one line per character:
#   "<char> <left> <bottom> <right> <top> <page>"
# --psm 7 treats the image as a single text line.
raw_boxes = pytesseract.image_to_boxes(img, lang='eng', config='--psm 7 --oem 3')

MAX_CHARS = 8  # how many leading characters to cover
# Skip blank lines and keep at most MAX_CHARS entries, so an image with fewer
# recognised characters no longer raises IndexError.
boxes = [line.split(' ') for line in raw_boxes.splitlines() if line.strip()][:MAX_CHARS]

if boxes:
    img_height = img.shape[0]
    left = min(int(box[1]) for box in boxes)
    right = max(int(box[3]) for box in boxes)
    # Tesseract measures y upwards from the bottom edge, OpenCV downwards from
    # the top edge, so the y-values are flipped before drawing.
    top = img_height - max(int(box[4]) for box in boxes)
    bottom = img_height - min(int(box[2]) for box in boxes)
    cv2.rectangle(img, (left, top), (right, bottom), (0, 255, 0), 2)

cv2.imshow('nubmer', img)
cv2.waitKey(0)
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。