-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.py
140 lines (119 loc) · 5.47 KB
/
index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
# Import PdfReader for reading PDF files
from PyPDF2 import PdfReader # type: ignore
# Import TfidfVectorizer for converting text to TF-IDF vectors
from sklearn.feature_extraction.text import TfidfVectorizer # type: ignore
# Import cosine_similarity for computing similarity between vectors
from sklearn.metrics.pairwise import cosine_similarity # type: ignore
# Import re for regular expression operations
import re
def extractText(pdf_path):
    """
    Read a PDF and return the text of all its pages as one string.

    Failures are handled on a best-effort basis: any exception raised while
    opening or parsing the file is printed, and whatever text was collected
    before the failure is returned.

    Args:
        pdf_path (str): The file path of the PDF.

    Returns:
        str: The concatenated page text; an empty string if nothing could be
            read.
    """
    page_texts = []
    try:
        # Binary mode is what PdfReader is handed in this code path
        with open(pdf_path, "rb") as handle:
            for page in PdfReader(handle).pages:
                # A falsy result from extract_text() (e.g. None) contributes nothing
                page_texts.append(page.extract_text() or "")
    except Exception as e:
        # Report and fall through so text gathered so far is still returned
        print(f"Error reading {pdf_path}: {e}")
    return "".join(page_texts)
def extractFeatures(text):
    """
    Search the text for the labelled invoice fields.

    Each field is located with a regular expression anchored on its label
    ("Invoice Number:", "Date:", "Amount:"); the value itself is capture
    group 1 of the corresponding match.

    Args:
        text (str): The text from which to extract features.

    Returns:
        dict: Maps 'invoice_number', 'date' and 'amount' (in that order) to
            the ``re.Match`` object for the field, or ``None`` when the field
            was not found.
    """
    # (feature name, pattern) pairs; tuple order fixes the key order of the result
    field_patterns = (
        ('invoice_number', r'Invoice Number:\s*(\S+)'),
        ('date', r'Date:\s*(\d{4}-\d{2}-\d{2})'),
        ('amount', r'Amount:\s*\$([\d,\.]+)'),
    )
    return {name: re.search(pattern, text) for name, pattern in field_patterns}
def calculateSimilarity(text1, text2):
    """
    Calculates the cosine similarity between two text documents.

    The two texts are vectorized together with TF-IDF and the cosine of the
    resulting pair of vectors is returned.

    Args:
        text1 (str): The first text document.
        text2 (str): The second text document.

    Returns:
        float: The cosine similarity score between the two text documents.
            0.0 is returned when either document is empty or whitespace-only,
            or when neither document yields a single usable term.
    """
    # A blank document has no terms, so it cannot be similar to anything.
    # Returning early also avoids the "empty vocabulary" ValueError that
    # TfidfVectorizer raises when both documents are blank (which happens
    # when PDF text extraction fails and yields "").
    if not (text1 and text1.strip()) or not (text2 and text2.strip()):
        return 0.0

    vectorizer = TfidfVectorizer()
    try:
        vectors = vectorizer.fit_transform([text1, text2])
    except ValueError as err:
        # Non-blank text can still produce no tokens (e.g. punctuation only);
        # treat that as "nothing in common" but let other errors surface.
        if "empty vocabulary" not in str(err):
            raise
        return 0.0

    # Row 0 is text1, row 1 is text2; the result is a 1x1 matrix
    return float(cosine_similarity(vectors[0:1], vectors[1:2])[0][0])
class InvoiceDatabase:
    """
    In-memory collection of invoice records with a closest-match lookup.

    Records are dicts; the lookup reads their 'text' and 'filename' keys.
    """

    def __init__(self):
        # Stored records, in the order they were added
        self.invoices = []

    def addInvoice(self, invoice):
        """
        Store one invoice record.

        Args:
            invoice (dict): A dictionary containing the invoice text and filename.
        """
        self.invoices.append(invoice)

    def findMostSimilar(self, inputInvoice):
        """
        Return the stored invoice whose text is most similar to the input.

        Records sharing the input's filename are skipped. A record is only
        selected when its score is strictly greater than the best so far, so
        zero-similarity records are never returned and the earliest record
        wins a tie.

        Args:
            inputInvoice (dict): A dictionary containing the input invoice text and filename.

        Returns:
            tuple: ``(invoice, score)`` for the best match, or ``(None, 0)``
                when no stored record scores above zero.
        """
        best_match = None
        best_score = 0
        for candidate in self.invoices:
            # Same filename means the same document: do not compare it with itself
            if candidate['filename'] == inputInvoice['filename']:
                continue
            score = calculateSimilarity(inputInvoice['text'], candidate['text'])
            if score > best_score:
                best_match, best_score = candidate, score
        return best_match, best_score
if __name__ == "__main__":
    db = InvoiceDatabase()  # Reference set the input invoice is compared against

    # PDF files that make up the reference set
    files = [
        'train/2024.03.15_0954.pdf',
        'train/2024.03.15_1145.pdf',
        'train/Faller_8.PDF',
        'train/invoice_77073.pdf',
        'train/invoice_102856.pdf'
    ]

    # Load every reference PDF into the database
    for pdfFile in files:
        # extractText catches and prints its own errors and returns the text
        # it managed to gather, so a failed read shows up here as empty text
        # rather than as an exception.
        invoice_text = extractText(pdfFile)
        if not invoice_text.strip():
            # Do not report success or store a record that can never match
            print(f"Error processing {pdfFile}: no text could be extracted")
            continue
        db.addInvoice({'text': invoice_text, 'filename': pdfFile})
        print(f"Successfully processed {pdfFile}")

    # Choose one file as the input invoice (for example)
    inputFile = 'test/invoice_102857.pdf'
    inputInvoiceText = extractText(inputFile)
    inputInvoice = {'text': inputInvoiceText, 'filename': inputFile}

    # Find the most similar invoice in the reference set
    mostSimilar, similarityScore = db.findMostSimilar(inputInvoice)
    if mostSimilar:
        print(f"Most similar invoice: {mostSimilar['filename']}")
        print(f"Similarity score: {similarityScore}")
    else:
        print("No similar invoice found.")

    # Extract the labelled fields from the input invoice
    input_features = extractFeatures(inputInvoiceText)
    print("\nInput Invoice Features:")
    for key, value in input_features.items():
        if value:
            # value is a regex match; group 1 holds the captured field value
            print(f"{key}: {value.group(1)}")
        else:
            print(f"{key}: Not found")