Commit c184a3c
Parent(s): 17d6209

Update files/functions.py

files/functions.py CHANGED (+12 -12)
@@ -416,7 +416,7 @@ def extraction_data_from_image(images):
         print(f"There was an error within the extraction of PDF text by the OCR!")
     else:
         from datasets import Dataset
-        dataset = Dataset.from_dict({"images_ids": images_ids_list, "images": images_list, "page_no": page_no_list, "num_pages": num_pages_list, "texts": lines_list, "bboxes_line": line_boxes_list})
+        dataset = Dataset.from_dict({"images_ids": images_ids_list, "images": images_list, "page_no": page_no_list, "num_pages": num_pages_list, "texts": lines_list, "bboxes_par": par_boxes_list})
 
         print(f"The text data was successfully extracted by the OCR!")
 
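Note: only the last column of the Dataset.from_dict call changes in this hunk; the rest of the schema is untouched. For readers unfamiliar with the datasets API, here is a minimal, self-contained sketch of the new paragraph-level schema (toy values; the app's real call also stores the PIL page images in an "images" column, omitted here):

from datasets import Dataset

# toy illustration of the new schema -- values are made up, not from the app
dataset = Dataset.from_dict({
    "images_ids": [0],
    "page_no": [0],
    "num_pages": [1],
    "texts": [["First paragraph.", "Second paragraph."]],
    "bboxes_par": [[[35, 40, 560, 120], [35, 140, 560, 260]]],  # one box per paragraph
})
print(dataset.column_names)
# ['images_ids', 'page_no', 'num_pages', 'texts', 'bboxes_par']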
@@ -433,7 +433,7 @@ def prepare_inference_features(example):
     # batch_page_hash = example["page_hash"]
     batch_images_ids = example["images_ids"]
     batch_images = example["images"]
-    batch_bboxes_line = example["bboxes_line"]
+    batch_bboxes_par = example["bboxes_par"]
     batch_texts = example["texts"]
     batch_images_size = [image.size for image in batch_images]
 
@@ -443,12 +443,12 @@ def prepare_inference_features(example):
     if not isinstance(batch_images_ids, list):
         batch_images_ids = [batch_images_ids]
         batch_images = [batch_images]
-        batch_bboxes_line = [batch_bboxes_line]
+        batch_bboxes_par = [batch_bboxes_par]
         batch_texts = [batch_texts]
         batch_width, batch_height = [batch_width], [batch_height]
 
     # process all images of the batch
-    for num_batch, (image_id, boxes, texts, width, height) in enumerate(zip(batch_images_ids, batch_bboxes_line, batch_texts, batch_width, batch_height)):
+    for num_batch, (image_id, boxes, texts, width, height) in enumerate(zip(batch_images_ids, batch_bboxes_par, batch_texts, batch_width, batch_height)):
         tokens_list = []
         bboxes_list = []
 
@@ -457,11 +457,11 @@ def prepare_inference_features(example):
             texts, boxes = [texts], [boxes]
 
         # convert boxes to original
-        normalize_bboxes_line = [normalize_box(upperleft_to_lowerright(box), width, height) for box in boxes]
+        normalize_bboxes_par = [normalize_box(upperleft_to_lowerright(box), width, height) for box in boxes]
 
         # sort boxes with texts
         # we want sorted lists from top to bottom of the image
-        boxes, texts = sort_data_wo_labels(normalize_bboxes_line, texts)
+        boxes, texts = sort_data_wo_labels(normalize_bboxes_par, texts)
 
         count = 0
         for box, text in zip(boxes, texts):
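Note: upperleft_to_lowerright, normalize_box and sort_data_wo_labels are defined elsewhere in files/functions.py and are not touched by this commit. A hedged sketch of what they are assumed to do, following the usual LayoutLM 0-1000 box convention (the names match the calls above; the bodies below are assumptions, not the app's code):

def upperleft_to_lowerright(box):
    # assumption: OCR boxes arrive as (left, top, width, height);
    # convert to (x0, y0, x1, y1)
    x0, y0, w, h = box
    return [x0, y0, x0 + w, y0 + h]

def normalize_box(box, width, height):
    # scale coordinates to the 0-1000 space expected by LayoutLM-family models
    return [
        int(1000 * box[0] / width),
        int(1000 * box[1] / height),
        int(1000 * box[2] / width),
        int(1000 * box[3] / height),
    ]

def sort_data_wo_labels(bboxes, texts):
    # sort paragraphs top-to-bottom, then left-to-right, keeping texts aligned
    pairs = sorted(zip(bboxes, texts), key=lambda p: (p[0][1], p[0][0]))
    sorted_bboxes, sorted_texts = map(list, zip(*pairs))
    return sorted_bboxes, sorted_texts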
@@ -593,8 +593,8 @@ def predictions_token_level(images, custom_encoded_dataset):
 
 from functools import reduce
 
-# Get predictions (line level)
-def predictions_line_level(dataset, outputs, images_ids_list, chunk_ids, input_ids, bboxes):
+# Get predictions (par level)
+def predictions_par_level(dataset, outputs, images_ids_list, chunk_ids, input_ids, bboxes):
 
     ten_probs_dict, ten_input_ids_dict, ten_bboxes_dict = dict(), dict(), dict()
     bboxes_list_dict, input_ids_dict_dict, probs_dict_dict, df = dict(), dict(), dict(), dict()
@@ -688,7 +688,7 @@ def predictions_line_level(dataset, outputs, images_ids_list, chunk_ids, input_ids, bboxes):
     else:
         print("An error occurred while getting predictions!")
 
-# Get labeled images with lines bounding boxes
+# Get labeled images with paragraphs bounding boxes
 def get_labeled_images(dataset, images_ids_list, bboxes_list_dict, probs_dict_dict):
 
     labeled_images = list()
@@ -763,7 +763,7 @@ def get_encoded_chunk_inference(index_chunk=None):
             del input_ids_dict[str(bboxes_list[-1])]
             bboxes_list = bboxes_list[:-1]
 
-    # get texts by lines
+    # get texts by paragraph
     input_ids_list = input_ids_dict.values()
     texts_list = [tokenizer.decode(input_ids) for input_ids in input_ids_list]
 
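Note: the decoding logic in this hunk is unchanged; only the comment moves from lines to paragraphs. A self-contained illustration of the "one decode per paragraph" step (placeholder tokenizer checkpoint and a toy input_ids_dict; the app keys this dict by the stringified paragraph box, as the del statement above shows):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")  # placeholder, not necessarily the app's checkpoint

# toy stand-in: stringified paragraph box -> accumulated token ids
input_ids_dict = {
    "[35, 40, 560, 120]": tokenizer("First paragraph.")["input_ids"],
    "[35, 140, 560, 260]": tokenizer("Second paragraph.")["input_ids"],
}

# same two lines as in the hunk above: one decoded text per paragraph
input_ids_list = input_ids_dict.values()
texts_list = [tokenizer.decode(input_ids) for input_ids in input_ids_list]
print(texts_list)  # e.g. ['<s> First paragraph.</s>', '<s> Second paragraph.</s>']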
@@ -773,7 +773,7 @@ def get_encoded_chunk_inference(index_chunk=None):
     return image, df, num_tokens, page_no, num_pages
 
 # display chunk of PDF image and its data
-def display_chunk_lines_inference(index_chunk=None):
+def display_chunk_paragraphs_inference(index_chunk=None):
 
     # get image and image data
     image, df, num_tokens, page_no, num_pages = get_encoded_chunk_inference(index_chunk=index_chunk)
@@ -786,7 +786,7 @@ def display_chunk_lines_inference(index_chunk=None):
     print(f'Chunk ({num_tokens} tokens) of the PDF (page: {page_no+1} / {num_pages})\n')
 
     # display image with bounding boxes
-    print(">> PDF image with bounding boxes of lines\n")
+    print(">> PDF image with bounding boxes of paragraphs\n")
     draw = ImageDraw.Draw(image)
 
     labels = list()