如何解决使用AWS Textract Java API无法获得与AWS Textract UI相同的键值数据
当我在AWS Textract UI上上传PDF时,能够看到全部两页的键值形式数据;但是当我使用AWS Textract Java API时,却无法获得第2页的键值数据。谁能帮我..!
我附上了UI屏幕截图、用于从块(Block)中提取键和值的Java代码,以及PDF文件。
谢谢。
// Creates the SNS topic and SQS queue Textract uses for job-completion
// notifications, subscribes the queue to the topic, and authorizes the topic
// to deliver messages to the queue. Stores the ARNs/URL in static fields.
//
// FIX: the original used CreatetopicRequest / CreatetopicResult / sns.createtopic;
// Java is case-sensitive and the AWS SDK v1 names are CreateTopicRequest,
// CreateTopicResult and createTopic — the original could not compile.
static void CreatetopicandQueue() {
    // Create a new SNS topic; the timestamp suffix keeps the name unique per run.
    snsTopicName = "AmazonTextractTopic" + Long.toString(System.currentTimeMillis());
    CreateTopicRequest createTopicRequest = new CreateTopicRequest(snsTopicName);
    CreateTopicResult createTopicResult = sns.createTopic(createTopicRequest);
    snsTopicArn = createTopicResult.getTopicArn();

    // Create a new SQS queue with a matching unique name.
    sqsQueueName = "AmazonTextractQueue" + Long.toString(System.currentTimeMillis());
    final CreateQueueRequest createQueueRequest = new CreateQueueRequest(sqsQueueName);
    sqsQueueUrl = sqs.createQueue(createQueueRequest).getQueueUrl();
    sqsQueueArn = sqs.getQueueAttributes(sqsQueueUrl, Arrays.asList("QueueArn"))
            .getAttributes().get("QueueArn");

    // Subscribe the SQS queue to the SNS topic so notifications land in the queue.
    String sqsSubscriptionArn = sns.subscribe(snsTopicArn, "sqs", sqsQueueArn).getSubscriptionArn();

    // Authorize sends to the queue, restricted to this topic via aws:SourceArn.
    Policy policy = new Policy().withStatements(
            new Statement(Effect.Allow)
                    .withPrincipals(Principal.AllUsers)
                    .withActions(SQSActions.SendMessage)
                    .withResources(new Resource(sqsQueueArn))
                    .withConditions(new Condition()
                            .withType("ArnEquals")
                            .withConditionKey("aws:SourceArn")
                            .withValues(snsTopicArn)));

    // was: raw Map/HashMap — typed map avoids unchecked warnings.
    Map<String, String> queueAttributes = new HashMap<>();
    queueAttributes.put(QueueAttributeName.Policy.toString(), policy.toJson());
    sqs.setQueueAttributes(new SetQueueAttributesRequest(sqsQueueUrl, queueAttributes));

    System.out.println("Topic arn: " + snsTopicArn);
    System.out.println("Queue arn: " + sqsQueueArn);
    System.out.println("Queue url: " + sqsQueueUrl);
    System.out.println("Queue sub arn: " + sqsSubscriptionArn);
}
// Tears down the SQS queue and SNS topic created by CreatetopicandQueue().
// Each client is null-checked so partial setup can still be cleaned up.
static void DeletetopicandQueue() {
    if (sqs != null) {
        sqs.deleteQueue(sqsQueueUrl);
        System.out.println("SQS queue deleted");
    }
    if (sns != null) {
        // was: sns.deletetopic(...) — the SDK method is deleteTopic (case-sensitive),
        // so the original did not compile.
        sns.deleteTopic(snsTopicArn);
        System.out.println("SNS topic deleted");
    }
}
// Starts asynchronous analysis of the input S3 document, then polls the SQS
// queue until the SNS job-completion notification for this job arrives.
// On SUCCEEDED it fetches and prints the analysis results.
static void ProcessDocument(String inBucket, String inDocument, String inRoleArn, Processtype type, LCA lca)
        throws Exception {
    bucket = inBucket;
    document = inDocument;
    roleArn = inRoleArn;
    StartDocumentAnalysis(bucket, document);
    System.out.println("Processing type: Analysis");
    System.out.println("Waiting for job: " + startJobId);

    // Poll the queue for messages; print progress dots while waiting.
    List<Message> messages = null;
    int dotLine = 0;
    boolean jobFound = false;

    // Loop until this job's status is published. Other messages are discarded.
    do {
        messages = sqs.receiveMessage(sqsQueueUrl).getMessages();
        if (dotLine++ < 40) {
            System.out.print(".");
        } else {
            System.out.println();
            dotLine = 0;
        }
        if (!messages.isEmpty()) {
            for (Message message : messages) {
                String notification = message.getBody();
                // The SQS body is an SNS envelope; the Textract payload
                // (JobId/Status) is the JSON string in its "Message" field.
                ObjectMapper mapper = new ObjectMapper();
                JsonNode jsonMessageTree = mapper.readTree(notification);
                JsonNode messageBodyText = jsonMessageTree.get("Message");
                JsonNode jsonResultTree = mapper.readTree(messageBodyText.textValue());
                JsonNode operationJobId = jsonResultTree.get("JobId");
                JsonNode operationStatus = jsonResultTree.get("Status");
                System.out.println("Job found was " + operationJobId);
                if (operationJobId.asText().equals(startJobId)) {
                    // Found our job: report status and fetch results on success.
                    jobFound = true;
                    System.out.println("Job id: " + operationJobId);
                    System.out.println("Status : " + operationStatus.toString());
                    if (operationStatus.asText().equals("SUCCEEDED")) {
                        GetDocumentAnalysisResults(lca);
                    } else {
                        System.out.println("Document analysis Failed");
                    }
                    // FIX: the original broke out of the loop BEFORE deleting the
                    // message on success, leaving the notification in the queue.
                    // Delete it in both the success and failure cases.
                    sqs.deleteMessage(sqsQueueUrl, message.getReceiptHandle());
                    break;
                } else {
                    System.out.println("Job received was not job " + startJobId);
                    // Delete unknown message. Consider a dead-letter queue instead.
                    sqs.deleteMessage(sqsQueueUrl, message.getReceiptHandle());
                }
            }
        } else {
            // Queue empty — back off before polling again.
            Thread.sleep(5000);
        }
    } while (!jobFound);
    System.out.println("Finished processing document");
}
// Kicks off an asynchronous Textract FORMS analysis of the given S3 object.
// Completion is reported through the SNS topic/role configured earlier; the
// returned job id is stored in startJobId for the polling loop to match on.
private static void StartDocumentAnalysis(String bucket, String document) throws Exception {
    // Channel over which Textract publishes the job-completion notification.
    NotificationChannel notificationChannel = new NotificationChannel()
            .withSNSTopicArn(snsTopicArn)
            .withRoleArn(roleArn);
    S3Object s3Object = new S3Object().withBucket(bucket).withName(document);
    DocumentLocation documentLocation = new DocumentLocation().withS3Object(s3Object);
    StartDocumentAnalysisRequest request = new StartDocumentAnalysisRequest()
            .withFeatureTypes(FeatureType.FORMS)
            .withDocumentLocation(documentLocation)
            .withJobTag("AnalyzingText")
            .withNotificationChannel(notificationChannel);
    startJobId = textract.startDocumentAnalysis(request).getJobId();
}
// Gets the results of processing started by StartDocumentAnalysis.
//
// FIX (the "UI shows both pages, API shows only page 1" problem): the original
// built key/value/block maps and resolved key->value relationships once PER
// PAGE of GetDocumentAnalysis results. Textract paginates the response, and a
// KEY_VALUE_SET block on one page can reference VALUE/CHILD block ids returned
// on a later page, so per-page resolution silently drops key-value pairs for
// multi-page documents. Accumulate blocks across ALL pages first, then resolve
// relationships once over the complete block set — matching the console UI.
private static void GetDocumentAnalysisResults(LCA lca) throws Exception {
    int maxResults = 1000;
    String paginationToken = null;
    GetDocumentAnalysisResult response = null;
    boolean finished = false; // was Boolean — primitive avoids pointless boxing

    // Accumulators spanning every page of results.
    List<Block> key_map = new ArrayList<Block>();
    List<Block> value_map = new ArrayList<Block>();
    List<Block> block_map = new ArrayList<Block>();

    // Loop until the pagination token is null.
    while (!finished) {
        GetDocumentAnalysisRequest documentAnalysisRequest = new GetDocumentAnalysisRequest()
                .withJobId(startJobId)
                .withMaxResults(maxResults)
                .withNextToken(paginationToken);
        response = textract.getDocumentAnalysis(documentAnalysisRequest);

        // Partition this page's blocks into keys, values, and the full id map.
        for (Block block : response.getBlocks()) {
            block_map.add(block);
            if (block.getBlockType().equals("KEY_VALUE_SET")) {
                if (block.getEntityTypes().contains("KEY")) {
                    key_map.add(block);
                } else {
                    value_map.add(block);
                }
            }
        }

        paginationToken = response.getNextToken();
        if (paginationToken == null) {
            finished = true;
        }
    }

    // Resolve key->value relationships once, over the complete document.
    System.out.println("------------------------------");
    getKVMapRelationship(key_map, value_map, block_map);
}
// Pairs each KEY block with the text of its linked VALUE block and returns the
// pairs in document order.
//
// NOTE(review): duplicate key text overwrites earlier entries in the map —
// if forms repeat labels across pages, consider Map<String, List<String>>.
@NotNull
public static LinkedHashMap<String,String> getKVMapRelationship(List<Block> key_map,List<Block> value_map,List<Block> block_map) throws IOException {
    // was: raw "new LinkedHashMap()" — diamond keeps the map properly typed.
    LinkedHashMap<String, String> kvs = new LinkedHashMap<>();
    for (Block key_block : key_map) {
        Block value_block = Find_value_block(key_block, value_map);
        String key = Get_text(key_block, block_map);
        String val = Get_text(value_block, block_map);
        kvs.put(key, val);
    }
    return kvs;
}
// Returns the VALUE block linked to the given KEY block via its "VALUE"
// relationship, or an empty Block when no match is found (preserving the
// original's @NotNull contract of never returning null).
@NotNull
public static Block Find_value_block(Block block,List<Block> value_map) {
    // FIX: guard against a null relationship list — the original would throw
    // an NPE in the for-each if getRelationships() returned null.
    if (block.getRelationships() != null) {
        for (Relationship relationship : block.getRelationships()) {
            if ("VALUE".equals(relationship.getType())) {
                for (String value_id : relationship.getIds()) {
                    for (Block value : value_map) {
                        // Block ids are unique, so returning the first match is
                        // equivalent to the original's keep-scanning behavior.
                        if (value.getId().equals(value_id)) {
                            return value;
                        }
                    }
                }
            }
        }
    }
    return new Block();
}
// Concatenates the text of all CHILD WORD blocks of the given block, separated
// by single spaces; a SELECTED selection element contributes "X ". Returns ""
// when the block is null or has no relationships.
@NotNull
public static String Get_text(Block result,List<Block> block_map) throws IOException {
    // was: String += in a loop — StringBuilder avoids O(n^2) concatenation.
    StringBuilder text = new StringBuilder();
    // Placeholder returned when a child id is not present in block_map.
    Block missing = new Block();
    try {
        if (result != null && CommonUtil.isListNotNullAndNotEmpty(result.getRelationships())) {
            for (Relationship relationship : result.getRelationships()) {
                if (relationship.getType().equals("CHILD")) {
                    for (String id : relationship.getIds()) {
                        System.out.println(id);
                        Block word = block_map.stream()
                                .filter(x -> x.getId().equals(id))
                                .findFirst()
                                .orElse(missing);
                        if (word.getBlockType() != null) {
                            if (word.getBlockType().equals("WORD")) {
                                text.append(word.getText()).append(" ");
                            } else if (word.getBlockType().equals("SELECTION_ELEMENT")) {
                                if (word.getSelectionStatus().equals("SELECTED")) {
                                    text.append("X ");
                                }
                            }
                        } else {
                            // Child id absent from block_map — with per-page block
                            // maps this happens when the child is on another page.
                            System.out.println(relationship.getIds());
                            System.out.println(word);
                        }
                    }
                }
            }
        }
    } catch (Exception e) {
        // FIX: was e.printstacktrace() — Java is case-sensitive; the method is
        // printStackTrace(), so the original did not compile.
        e.printStackTrace();
    }
    return text.toString();
}
PDF档案:原文附有下载链接(此处链接已失效)。
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。