commit c35ff9d1dd3effa81cf914073c448bc83889d7a3
Author: Shihaam Abdul Rahman
Date:   Fri Aug 2 22:51:03 2024 +0500

    init - huge mess

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0c5c09b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+*.jpg
+*.jpeg
+*.png
+*.save
+venv/*
diff --git a/bml.py b/bml.py
new file mode 100644
index 0000000..a410a95
--- /dev/null
+++ b/bml.py
@@ -0,0 +1,25 @@
+import sys
+import pytesseract
+from PIL import Image
+import json
+
+def ocr_image_to_json(image_path):
+    try:
+        # Open the image file
+        image = Image.open(image_path)
+
+        # Perform OCR on the image
+        ocr_result = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
+
+        # Convert the OCR result to JSON
+        ocr_json = json.dumps(ocr_result, indent=4)
+        print(ocr_json)
+    except Exception as e:
+        print(f"Error: {e}")
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        print("Usage: python ocr_to_json.py <image_path>")
+    else:
+        image_path = sys.argv[1]
+        ocr_image_to_json(image_path)
diff --git a/printall.py b/printall.py
new file mode 100644
index 0000000..241b020
--- /dev/null
+++ b/printall.py
@@ -0,0 +1,25 @@
+import sys
+import pytesseract
+from PIL import Image
+
+def ocr_image_to_text(image_path):
+    try:
+        # Open the image file
+        image = Image.open(image_path)
+
+        # Perform OCR on the image
+        ocr_result = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
+
+        # Extract and print the text portion
+        text = [word for word in ocr_result['text'] if word.strip() != ""]
+        print(" ".join(text))
+    except Exception as e:
+        print(f"Error: {e}")
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        print("Usage: python ocr_to_text.py <image_path>")
+    else:
+        image_path = sys.argv[1]
+        ocr_image_to_text(image_path)
+
diff --git a/processed.py b/processed.py
new file mode 100644
index 0000000..0480964
--- /dev/null
+++ b/processed.py
@@ -0,0 +1,70 @@
+import sys
+import pytesseract
+from PIL import Image
+import re
+import json
+
+def ocr_image_to_json(image_path):
+    try:
+        # Open the image file
+        image = Image.open(image_path)
+
+        # Perform OCR on the image
+        ocr_result = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
+        text = " ".join([word for word in ocr_result['text'] if word.strip() != ""])
+
+        # Debug: Print the extracted text
+        print("Extracted Text:", text)
+
+        # Regex patterns
+        reference_pattern = r"BLAZ\d{12}"
+        date_pattern = r"\d{2}/\d{2}/\d{4}\s?\d{2}:\d{2}"
+        amount_pattern = r"MVR\s?(\d+\.\d{2})"
+        to_address_pattern = r"(9\d{17}|7\d{13})"
+
+        # Extracting data using regex
+        reference = re.search(reference_pattern, text)
+        transaction_date = re.search(date_pattern, text)
+        amount = re.search(amount_pattern, text)
+        to_address = re.search(to_address_pattern, text)
+
+        # Debug: Print the matched regex groups
+        print("Reference:", reference.group() if reference else "Not found")
+        print("Transaction Date:", transaction_date.group() if transaction_date else "Not found")
+        print("Amount:", amount.group(1) if amount else "Not found")
+        print("To Address:", to_address.group() if to_address else "Not found")
+
+        # Find 'from' and 'to' names
+        names_pattern = re.compile(r"([A-Z]+\s?[A-Z.]+)")
+        names = names_pattern.findall(text)
+
+        # Debug: Print the names found
+        print("Names Found:", names)
+
+        from_name, to_name = "", ""
+        if len(names) > 1:
+            from_name, to_name = names[0], names[1]
+
+        # Prepare the result in JSON format
+        result = {
+            "status": "SUCCESS",
+            "message": "Thank you. Transfer transaction is successful.",
+            "reference": reference.group() if reference else "",
+            "transaction_date": transaction_date.group() if transaction_date else "",
+            "from": from_name,
+            "to_name": to_name,
+            "to_account": to_address.group() if to_address else "",
+            "amount": amount.group(1) if amount else ""
+        }
+
+        # Print the result as JSON
+        print(json.dumps(result, indent=4))
+    except Exception as e:
+        print(f"Error: {e}")
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        print("Usage: python ocr_to_json.py <image_path>")
+    else:
+        image_path = sys.argv[1]
+        ocr_image_to_json(image_path)