forked from mukeshrathore/SmartDoc-Python
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclassifile.py
More file actions
93 lines (74 loc) · 3.18 KB
/
classifile.py
File metadata and controls
93 lines (74 loc) · 3.18 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# -*- coding: utf-8 -*-
"""DocMan.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1WtE6tnhXixGMZGL5mS9f_qqgtxS7Pr4D
"""
# pip install transformers==4.35.2 datasets==2.15.0 scikit-learn==1.2.2
# pip install langchain-community
# pip install pypdf torch
from transformers import pipeline
import torch
from torch import Tensor
from langchain_community.document_loaders import PyPDFLoader
from pypdf import PdfReader
import sys
import os
def PDFloader(link, max_chars=100):
    """Read a PDF, print simple text statistics, and return a short preview.

    Args:
        link: Location of the PDF; passed unchanged to ``PdfReader``.
        max_chars: Number of leading characters of the extracted text to
            print and return. Defaults to 100, the previously hard-coded
            preview length.

    Returns:
        The first ``max_chars`` characters of the text extracted from all
        pages, concatenated in page order.
    """
    pdfreader = PdfReader(link)
    pages = pdfreader.pages

    # Pages whose extract_text() yields nothing (None or "") add no text.
    raw_text = "".join(page.extract_text() or "" for page in pages)

    num_of_pages = len(pages)
    # Count whitespace-separated words; len(raw_text) would be the number of
    # characters, which is not what the printed label says.
    num_of_words = len(raw_text.split())
    num_new_lines = raw_text.count("\n")
    period_counts = raw_text.count(".")
    print("Number of pages:", num_of_pages, "\n"
          "Number of words:", num_of_words, "\n"
          "Number of new lines:", num_new_lines, "\n"
          "Number of periods:", period_counts)

    # The preview is a character slice (not tokens); it is kept short so the
    # downstream classifier is not handed the whole document.
    preview = raw_text[:max_chars]
    print(preview)
    return preview
def classification(sequences):
    """Zero-shot classify *sequences* against a fixed set of categories.

    A ``zero-shot-classification`` pipeline backed by
    ``facebook/bart-large-mnli`` is created on every call, placed on CUDA
    when ``torch`` reports it as available and on the CPU otherwise.

    Args:
        sequences: Input handed unchanged to the pipeline.

    Returns:
        Whatever the pipeline returns for ``sequences`` and the candidate
        labels, unmodified.
    """
    labels = ["Technology", "Business", "Legal", "Human Resources", "Tax", "Finance"]
    # Alternative, more granular label set (currently disabled):
    # labels = ["Brokerage account agreements", "Adding account features", "Transfers distributions and payments", "Retirement account forms", "Tax and legal forms", "Estate processing forms", "Miscellaneous forms"]

    if torch.cuda.is_available():
        target_device = "cuda"
    else:
        target_device = "cpu"

    zero_shot = pipeline(
        "zero-shot-classification",
        model="facebook/bart-large-mnli",
        device=target_device,
    )
    return zero_shot(sequences, labels)
def Get_Top_Category(string_now):
    """Return the first entry of the ``labels`` list in a stringified result.

    ``string_now`` is expected to be the ``str()`` of a classification
    result: a dict with a ``'labels'`` list, or a list of such dicts, in
    which case the first dict is used.

    The string is parsed as a Python literal first. Scanning the raw text
    for the word ``labels`` is unreliable because the classified text is
    part of the same string and may itself contain that word, and because
    labels may contain commas or quotes.

    Args:
        string_now: String form of the classification result.

    Returns:
        The first label as a string.

    Raises:
        ValueError: If the string cannot be parsed and does not contain
            ``labels`` at all.
    """
    import ast

    try:
        parsed = ast.literal_eval(string_now)
    except (ValueError, SyntaxError):
        parsed = None

    # A batch of results is a list of dicts; use the first one.
    if isinstance(parsed, (list, tuple)) and parsed:
        parsed = parsed[0]
    if isinstance(parsed, dict):
        labels = parsed.get('labels')
        if isinstance(labels, (list, tuple)) and labels:
            return str(labels[0])

    # Fallback for text that is not a complete Python literal: take the
    # 200 characters following "labels': ", drop brackets and quotes, and
    # keep whatever precedes the first comma.
    start = string_now.index('labels')
    window = string_now[start + 9:start + 209]
    for unwanted in "[]'":
        window = window.replace(unwanted, '')
    return window.split(',')[0]
def Output(category, path):
    """Format ``path`` and ``category`` as ``"<path after src/>|<category>"``.

    Everything after the first ``src/`` in ``path`` is kept. A path with no
    ``src/`` raises ``ValueError`` (from ``str.index``).
    """
    marker = 'src/'
    tail = path[path.index(marker) + len(marker):]
    return "|".join((tail, category))
if __name__ == "__main__":
    # Script entry point: classify one fixed-name PDF found in the directory
    # given as the first command-line argument (the current directory when
    # no argument is supplied), print the raw classification result, then
    # launch the follow-up script infocus.py.
    import subprocess  # only needed when run as a script

    # Earlier manual test inputs, kept for reference:
    #   classification(["Microsoft Pushes Off SP2 Release Microsoft will delay the release of its SP2 update for another week to fix software glitches."])
    #   path = r'/usr/local/src/Expatriate_Certificate.pdf'
    #   path = r'/usr/local/src/Tax_and_legal_form.pdf'

    directory = sys.argv[1] if len(sys.argv) > 1 else ""
    path = os.path.join(directory, 'Estate Processing Checklist - 595906 (7).pdf')

    preliminary_string = classification(PDFloader(path))
    category = Get_Top_Category(str(preliminary_string))
    print(preliminary_string)
    # print(Output(category, path))

    # Run the follow-up script with the interpreter executing this one. A
    # bare "python" is looked up on PATH and may be missing or belong to a
    # different environment. Fall back to it only if the interpreter path
    # is unknown.
    subprocess.run([sys.executable or "python", "./infocus.py"])