微信公众号搜"智元新知"关注
微信扫一扫可直接关注哦!

使用 AWS Textract Java API 无法获得与 AWS Textract UI 相同的键值数据

如何解决使用 AWS Textract Java API 无法获得与 AWS Textract UI 相同的键值数据

当我在 AWS Textract UI 上上传 PDF 时,我能够看到键值形式的所有数据;但是当我使用 AWS Textract Java API 时,却无法以键值形式获得这 2 页的数据。谁能帮帮我?

我已经附上了 UI 屏幕截图、用于从 Block 中获取键和值的 Java 代码,以及 PDF 文件。

谢谢。

/**
 * Creates an SNS topic and an SQS queue (both named with a millisecond timestamp
 * suffix), subscribes the queue to the topic, and attaches a queue policy that
 * permits the topic to send messages to the queue.
 *
 * <p>The results are stored in the static fields {@code snsTopicName},
 * {@code snsTopicArn}, {@code sqsQueueName}, {@code sqsQueueUrl} and
 * {@code sqsQueueArn}; the ARNs and the queue URL are also printed to standard output.
 */
static void CreatetopicandQueue() {
    // create a new SNS topic
    snsTopicName = "AmazonTextractTopic" + System.currentTimeMillis();
    CreateTopicRequest createTopicRequest = new CreateTopicRequest(snsTopicName);
    CreateTopicResult createTopicResult = sns.createTopic(createTopicRequest);
    snsTopicArn = createTopicResult.getTopicArn();

    // Create a new SQS Queue
    sqsQueueName = "AmazonTextractQueue" + System.currentTimeMillis();
    final CreateQueueRequest createQueueRequest = new CreateQueueRequest(sqsQueueName);
    sqsQueueUrl = sqs.createQueue(createQueueRequest).getQueueUrl();
    sqsQueueArn = sqs.getQueueAttributes(sqsQueueUrl, Arrays.asList("QueueArn")).getAttributes().get("QueueArn");

    // Subscribe SQS queue to SNS topic
    String sqsSubscriptionArn = sns.subscribe(snsTopicArn, "sqs", sqsQueueArn).getSubscriptionArn();

    // Authorize queue: SendMessage is allowed for any principal, but only when the
    // request's aws:SourceArn equals the topic ARN created above.
    Policy policy = new Policy().withStatements(
            new Statement(Effect.Allow)
                    .withPrincipals(Principal.AllUsers)
                    .withActions(SQSActions.SendMessage)
                    .withResources(new Resource(sqsQueueArn))
                    .withConditions(new Condition()
                            .withType("ArnEquals")
                            .withConditionKey("aws:SourceArn")
                            .withValues(snsTopicArn)));

    Map<String, String> queueAttributes = new HashMap<>();
    queueAttributes.put(QueueAttributeName.Policy.toString(), policy.toJson());
    sqs.setQueueAttributes(new SetQueueAttributesRequest(sqsQueueUrl, queueAttributes));

    System.out.println("Topic arn: " + snsTopicArn);
    System.out.println("Queue arn: " + sqsQueueArn);
    System.out.println("Queue url: " + sqsQueueUrl);
    System.out.println("Queue sub arn: " + sqsSubscriptionArn);
}

/**
 * Deletes the SQS queue identified by {@code sqsQueueUrl} and the SNS topic
 * identified by {@code snsTopicArn}, printing a confirmation line for each.
 *
 * <p>A resource is skipped when its client is {@code null} or when its URL/ARN was
 * never set, so that a missing queue does not prevent the topic from being deleted.
 */
static void DeletetopicandQueue() {
    if (sqs != null && sqsQueueUrl != null) {
        sqs.deleteQueue(sqsQueueUrl);
        System.out.println("SQS queue deleted");
    }

    if (sns != null && snsTopicArn != null) {
        sns.deleteTopic(snsTopicArn);
        System.out.println("SNS topic deleted");
    }
}

/**
 * Starts the processing of the input document and blocks until the completion
 * notification for the started job has been read from the SQS queue.
 *
 * <p>The arguments are copied into the static fields {@code bucket}, {@code document}
 * and {@code roleArn}, the analysis job is started, and the queue at
 * {@code sqsQueueUrl} is polled. Messages for other jobs are deleted and ignored.
 * When the message for {@code startJobId} arrives with status {@code SUCCEEDED},
 * the results are fetched via {@code GetDocumentAnalysisResults}.
 *
 * @param inBucket   bucket name, stored in {@code bucket}
 * @param inDocument document name, stored in {@code document}
 * @param inRoleArn  role ARN, stored in {@code roleArn}
 * @param type       not read by this method; analysis is always performed
 * @param lca        passed through to {@code GetDocumentAnalysisResults}
 * @throws Exception if starting the job, polling, JSON parsing, or result retrieval fails
 */
static void ProcessDocument(String inBucket,String inDocument,String inRoleArn,Processtype type,LCA lca)
        throws Exception {
    bucket = inBucket;
    document = inDocument;
    roleArn = inRoleArn;
    StartDocumentAnalysis(bucket, document);
    System.out.println("Processing type: Analysis");

    System.out.println("Waiting for job: " + startJobId);

    // One mapper is enough for both the SNS envelope and the embedded job notification.
    final ObjectMapper mapper = new ObjectMapper();
    int dotLine = 0;
    boolean jobFound = false;

    // loop until the job status is published. Ignore other messages in queue.
    do {
        // Poll queue for messages
        List<Message> messages = sqs.receiveMessage(sqsQueueUrl).getMessages();
        if (dotLine++ < 40) {
            System.out.print(".");
        } else {
            System.out.println();
            dotLine = 0;
        }

        if (messages.isEmpty()) {
            Thread.sleep(5000);
            continue;
        }

        // Loop through messages received.
        for (Message message : messages) {
            String notification = message.getBody();

            // Get status and job id from notification: the body is a JSON envelope
            // whose "Message" field holds the job notification as a JSON string.
            JsonNode jsonMessageTree = mapper.readTree(notification);
            JsonNode messageBodyText = jsonMessageTree.get("Message");
            JsonNode jsonResultTree = mapper.readTree(messageBodyText.textValue());
            JsonNode operationJobId = jsonResultTree.get("JobId");
            JsonNode operationStatus = jsonResultTree.get("Status");
            System.out.println("Job found was " + operationJobId);

            if (operationJobId.asText().equals(startJobId)) {
                // Found job. Get the results and display.
                jobFound = true;
                System.out.println("Job id: " + operationJobId);
                System.out.println("Status : " + operationStatus.toString());

                boolean succeeded = operationStatus.asText().equals("SUCCEEDED");
                if (succeeded) {
                    GetDocumentAnalysisResults(lca);
                } else {
                    System.out.println("Document analysis Failed");
                }

                // Delete the notification in both outcomes; previously the success
                // path left the loop before reaching this call.
                sqs.deleteMessage(sqsQueueUrl, message.getReceiptHandle());

                if (succeeded) {
                    break;
                }
            } else {
                System.out.println("Job received was not job " + startJobId);
                // Delete unknown message. Consider moving message to dead letter queue
                sqs.deleteMessage(sqsQueueUrl, message.getReceiptHandle());
            }
        }
    } while (!jobFound);

    System.out.println("Finished processing document");
}

/**
 * Submits an asynchronous document-analysis request with the FORMS feature type for
 * the given S3 object and stores the returned job id in {@code startJobId}.
 *
 * <p>The request carries the job tag {@code "AnalyzingText"} and a notification
 * channel built from {@code snsTopicArn} and {@code roleArn}.
 *
 * @param bucket   name of the S3 bucket that holds the document
 * @param document name of the S3 object to analyze
 * @throws Exception if the start call fails
 */
private static void StartDocumentAnalysis(String bucket,String document) throws Exception {
    // Where the document lives.
    S3Object sourceObject = new S3Object().withBucket(bucket).withName(document);
    DocumentLocation sourceLocation = new DocumentLocation().withS3Object(sourceObject);

    // Where the completion notification is published.
    NotificationChannel completionChannel = new NotificationChannel()
            .withSNSTopicArn(snsTopicArn)
            .withRoleArn(roleArn);

    StartDocumentAnalysisRequest request = new StartDocumentAnalysisRequest()
            .withFeatureTypes(FeatureType.FORMS)
            .withDocumentLocation(sourceLocation)
            .withJobTag("AnalyzingText")
            .withNotificationChannel(completionChannel);

    startJobId = textract.startDocumentAnalysis(request).getJobId();
}

/**
 * Gets the results of processing started by StartDocumentAnalysis.
 *
 * <p>Fetches every paginated response for {@code startJobId}, collects all returned
 * blocks, and only then resolves key/value relationships. The blocks must be
 * collected across all responses first because a block's relationship ids can refer
 * to blocks delivered in a different response; resolving each response in isolation
 * drops those keys and values.
 *
 * @param lca not read by this method
 * @throws Exception if a result request or the key/value resolution fails
 */
private static void GetDocumentAnalysisResults(LCA lca) throws Exception {

    final int maxResults = 1000;

    // Accumulated over ALL responses, not per response.
    List<Block> key_map = new ArrayList<>();
    List<Block> value_map = new ArrayList<>();
    List<Block> block_map = new ArrayList<>();

    String paginationToken = null;
    // loops until pagination token is null
    do {
        GetDocumentAnalysisRequest documentAnalysisRequest = new GetDocumentAnalysisRequest()
                .withJobId(startJobId)
                .withMaxResults(maxResults)
                .withNextToken(paginationToken);

        GetDocumentAnalysisResult response = textract.getDocumentAnalysis(documentAnalysisRequest);

        List<Block> blocks = response.getBlocks();
        if (blocks != null) {
            for (Block block : blocks) {
                block_map.add(block);
                if ("KEY_VALUE_SET".equals(block.getBlockType())) {
                    List<String> entityTypes = block.getEntityTypes();
                    if (entityTypes != null && entityTypes.contains("KEY")) {
                        key_map.add(block);
                    } else {
                        value_map.add(block);
                    }
                }
            }
        }

        paginationToken = response.getNextToken();
    } while (paginationToken != null);

    System.out.println("------------------------------");

    // NOTE(review): the returned key/value map is not used here — confirm whether it
    // should be stored or handed to the caller.
    getKVMapRelationship(key_map, value_map, block_map);
}

/**
 * Builds an insertion-ordered map from key text to value text.
 *
 * <p>For each key block, the matching value block is looked up with
 * {@code Find_value_block} and both blocks are converted to text with
 * {@code Get_text}. If two key blocks produce the same text, the later value
 * replaces the earlier one.
 *
 * @param key_map   blocks to use as keys
 * @param value_map candidate value blocks
 * @param block_map blocks used to resolve child ids into text
 * @return the key-text to value-text map, in the iteration order of {@code key_map}
 * @throws IOException declared by {@code Get_text}
 */
@NotNull
public static LinkedHashMap<String,String> getKVMapRelationship(List<Block> key_map,List<Block> value_map,List<Block> block_map) throws IOException {
    LinkedHashMap<String, String> kvs = new LinkedHashMap<>();
    for (Block key_block : key_map) {
        Block value_block = Find_value_block(key_block, value_map);
        String key = Get_text(key_block, block_map);
        String val = Get_text(value_block, block_map);
        kvs.put(key, val);
    }
    return kvs;
}

/**
 * Finds the value block referenced by a key block's {@code VALUE} relationship.
 *
 * @param block     the key block whose relationships are inspected
 * @param value_map candidate value blocks, matched by id
 * @return the matching block from {@code value_map} (the last match if several ids
 *         match), or a new empty {@code Block} when the key block has no
 *         relationships or no referenced id is found
 */
@NotNull
public static Block Find_value_block(Block block,List<Block> value_map) {
    Block value_block = new Block();

    // A block without relationships has nothing to resolve; avoid iterating null.
    List<Relationship> relationships = block.getRelationships();
    if (relationships == null) {
        return value_block;
    }

    for (Relationship relationship : relationships) {
        if (!"VALUE".equals(relationship.getType())) {
            continue;
        }
        for (String value_id : relationship.getIds()) {
            for (Block value : value_map) {
                if (value.getId().equals(value_id)) {
                    value_block = value;
                }
            }
        }
    }
    return value_block;
}

/**
 * Concatenates the text of a block's {@code CHILD} blocks.
 *
 * <p>Each child id is looked up in {@code block_map}. A {@code WORD} child contributes
 * its text followed by a space; a {@code SELECTION_ELEMENT} child contributes
 * {@code "X "} when its selection status is {@code SELECTED}. Each child id is printed
 * to standard output; when an id is not found in {@code block_map} (or the found block
 * has no type), the relationship's ids and the placeholder block are printed as well.
 *
 * <p>Any exception raised while building the text is printed and the text gathered
 * so far is returned.
 *
 * @param result    the block whose children are read; may be {@code null}
 * @param block_map blocks searched by id
 * @return the concatenated text, or an empty string when {@code result} is
 *         {@code null} or {@code CommonUtil.isListNotNullAndNotEmpty} rejects its
 *         relationship list
 * @throws IOException declared for callers; not thrown by the code in this method
 */
@NotNull
public static String Get_text(Block result,List<Block> block_map) throws IOException {
    StringBuilder text = new StringBuilder();
    // Placeholder returned by the lookup when an id is absent; its block type is null.
    Block word2 = new Block();
    try {
        if (result != null && CommonUtil.isListNotNullAndNotEmpty(result.getRelationships())) {
            for (Relationship relationship : result.getRelationships()) {
                if (!"CHILD".equals(relationship.getType())) {
                    continue;
                }
                for (String id : relationship.getIds()) {
                    System.out.println(id);
                    Block word = block_map.stream()
                            .filter(x -> x.getId().equals(id))
                            .findFirst()
                            .orElse(word2);

                    String blockType = word.getBlockType();
                    if (blockType == null) {
                        System.out.println(relationship.getIds());
                        System.out.println(word);
                    } else if ("WORD".equals(blockType)) {
                        text.append(word.getText()).append(' ');
                    } else if ("SELECTION_ELEMENT".equals(blockType)
                            && "SELECTED".equals(word.getSelectionStatus())) {
                        text.append("X ");
                    }
                }
            }
        }
    } catch (Exception e) {
        // Best effort: report the failure and return whatever text was collected.
        e.printStackTrace();
    }
    return text.toString();
}

enter image description here

PDF档案 enter link description here

版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。