Commit c184a3c
Parent(s): 17d6209

Update files/functions.py

files/functions.py CHANGED (+12 -12)
@@ -416,7 +416,7 @@ def extraction_data_from_image(images):
         print(f"There was an error within the extraction of PDF text by the OCR!")
     else:
         from datasets import Dataset
-        dataset = Dataset.from_dict({"images_ids": images_ids_list, "images": images_list, "page_no": page_no_list, "num_pages": num_pages_list, "texts": lines_list, "bboxes_line": line_boxes_list})
+        dataset = Dataset.from_dict({"images_ids": images_ids_list, "images": images_list, "page_no": page_no_list, "num_pages": num_pages_list, "texts": lines_list, "bboxes_par": par_boxes_list})
 
         print(f"The text data was successfully extracted by the OCR!")
 
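Note: only the last column of the Dataset.from_dict call changes in this hunk; the rest of the schema is untouched. For readers unfamiliar with the datasets API, here is a minimal, self-contained sketch of the new paragraph-level schema (toy values; the app's real call also stores the PIL page images in an "images" column, omitted here):

from datasets import Dataset

# toy illustration of the new schema -- values are made up, not from the app
dataset = Dataset.from_dict({
    "images_ids": [0],
    "page_no": [0],
    "num_pages": [1],
    "texts": [["First paragraph.", "Second paragraph."]],
    "bboxes_par": [[[35, 40, 560, 120], [35, 140, 560, 260]]],  # one box per paragraph
})
print(dataset.column_names)
# ['images_ids', 'page_no', 'num_pages', 'texts', 'bboxes_par']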
@@ -433,7 +433,7 @@ def prepare_inference_features(example):
     # batch_page_hash = example["page_hash"]
     batch_images_ids = example["images_ids"]
     batch_images = example["images"]
-    batch_bboxes_line = example["bboxes_line"]
+    batch_bboxes_par = example["bboxes_par"]
     batch_texts = example["texts"]
     batch_images_size = [image.size for image in batch_images]
 
@@ -443,12 +443,12 @@ def prepare_inference_features(example):
     if not isinstance(batch_images_ids, list):
         batch_images_ids = [batch_images_ids]
         batch_images = [batch_images]
-        batch_bboxes_line = [batch_bboxes_line]
+        batch_bboxes_par = [batch_bboxes_par]
         batch_texts = [batch_texts]
         batch_width, batch_height = [batch_width], [batch_height]
 
     # process all images of the batch
-    for num_batch, (image_id, boxes, texts, width, height) in enumerate(zip(batch_images_ids, batch_bboxes_line, batch_texts, batch_width, batch_height)):
+    for num_batch, (image_id, boxes, texts, width, height) in enumerate(zip(batch_images_ids, batch_bboxes_par, batch_texts, batch_width, batch_height)):
         tokens_list = []
         bboxes_list = []
 
@@ -457,11 +457,11 @@ def prepare_inference_features(example):
             texts, boxes = [texts], [boxes]
 
         # convert boxes to original
-        normalize_bboxes_line = [normalize_box(upperleft_to_lowerright(box), width, height) for box in boxes]
+        normalize_bboxes_par = [normalize_box(upperleft_to_lowerright(box), width, height) for box in boxes]
 
         # sort boxes with texts
         # we want sorted lists from top to bottom of the image
-        boxes, texts = sort_data_wo_labels(normalize_bboxes_line, texts)
+        boxes, texts = sort_data_wo_labels(normalize_bboxes_par, texts)
 
         count = 0
         for box, text in zip(boxes, texts):
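Note: upperleft_to_lowerright, normalize_box and sort_data_wo_labels are defined elsewhere in files/functions.py and are not touched by this commit. A hedged sketch of what they are assumed to do, following the usual LayoutLM 0-1000 box convention (the names match the calls above; the bodies below are assumptions, not the app's code):

def upperleft_to_lowerright(box):
    # assumption: OCR boxes arrive as (left, top, width, height);
    # convert to (x0, y0, x1, y1)
    x0, y0, w, h = box
    return [x0, y0, x0 + w, y0 + h]

def normalize_box(box, width, height):
    # scale coordinates to the 0-1000 space expected by LayoutLM-family models
    return [
        int(1000 * box[0] / width),
        int(1000 * box[1] / height),
        int(1000 * box[2] / width),
        int(1000 * box[3] / height),
    ]

def sort_data_wo_labels(bboxes, texts):
    # sort paragraphs top-to-bottom, then left-to-right, keeping texts aligned
    pairs = sorted(zip(bboxes, texts), key=lambda p: (p[0][1], p[0][0]))
    sorted_bboxes, sorted_texts = map(list, zip(*pairs))
    return sorted_bboxes, sorted_texts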
@@ -593,8 +593,8 @@ def predictions_token_level(images, custom_encoded_dataset):
 
 from functools import reduce
 
-# Get predictions (line level)
-def predictions_line_level(dataset, outputs, images_ids_list, chunk_ids, input_ids, bboxes):
+# Get predictions (par level)
+def predictions_par_level(dataset, outputs, images_ids_list, chunk_ids, input_ids, bboxes):
 
     ten_probs_dict, ten_input_ids_dict, ten_bboxes_dict = dict(), dict(), dict()
     bboxes_list_dict, input_ids_dict_dict, probs_dict_dict, df = dict(), dict(), dict(), dict()
@@ -688,7 +688,7 @@ def predictions_line_level(dataset, outputs, images_ids_list, chunk_ids, input_ids, bboxes):
     else:
         print("An error occurred while getting predictions!")
 
-# Get labeled images with lines bounding boxes
+# Get labeled images with paragraphs bounding boxes
 def get_labeled_images(dataset, images_ids_list, bboxes_list_dict, probs_dict_dict):
 
     labeled_images = list()
@@ -763,7 +763,7 @@ def get_encoded_chunk_inference(index_chunk=None):
             del input_ids_dict[str(bboxes_list[-1])]
             bboxes_list = bboxes_list[:-1]
 
-    # get texts by lines
+    # get texts by paragraph
     input_ids_list = input_ids_dict.values()
     texts_list = [tokenizer.decode(input_ids) for input_ids in input_ids_list]
 
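Note: the decoding logic in this hunk is unchanged; only the comment moves from lines to paragraphs. A self-contained illustration of the "one decode per paragraph" step (placeholder tokenizer checkpoint and a toy input_ids_dict; the app keys this dict by the stringified paragraph box, as the del statement above shows):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")  # placeholder, not necessarily the app's checkpoint

# toy stand-in: stringified paragraph box -> accumulated token ids
input_ids_dict = {
    "[35, 40, 560, 120]": tokenizer("First paragraph.")["input_ids"],
    "[35, 140, 560, 260]": tokenizer("Second paragraph.")["input_ids"],
}

# same two lines as in the hunk above: one decoded text per paragraph
input_ids_list = input_ids_dict.values()
texts_list = [tokenizer.decode(input_ids) for input_ids in input_ids_list]
print(texts_list)  # e.g. ['<s> First paragraph.</s>', '<s> Second paragraph.</s>']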
@@ -773,7 +773,7 @@ def get_encoded_chunk_inference(index_chunk=None):
     return image, df, num_tokens, page_no, num_pages
 
 # display chunk of PDF image and its data
-def display_chunk_lines_inference(index_chunk=None):
+def display_chunk_paragraphs_inference(index_chunk=None):
 
     # get image and image data
     image, df, num_tokens, page_no, num_pages = get_encoded_chunk_inference(index_chunk=index_chunk)
@@ -786,7 +786,7 @@ def display_chunk_lines_inference(index_chunk=None):
     print(f'Chunk ({num_tokens} tokens) of the PDF (page: {page_no+1} / {num_pages})\n')
 
     # display image with bounding boxes
-    print(">> PDF image with bounding boxes of lines\n")
+    print(">> PDF image with bounding boxes of paragraphs\n")
     draw = ImageDraw.Draw(image)
 
     labels = list()