Python Programming: n-gram implementation in Python using PDF file Operations

Hi All,
See the code below.
---------------------------------------------------------------------------------------------------------------------
# code to generate n-grams

import re
def ngram_gen(s, n):
    """Return the word n-grams of *s* as space-joined strings.

    The text is lower-cased, every character that is not an ASCII letter,
    a digit or whitespace is replaced by a space, and the result is split
    into tokens on any run of whitespace (spaces, tabs, newlines).

    Args:
        s: Text to tokenize.
        n: Number of consecutive tokens in each n-gram.

    Returns:
        The n-grams in order of appearance, each one a string of ``n``
        tokens joined by single spaces. The list is empty when the text
        has fewer than ``n`` tokens.
    """
    s = s.lower()
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
    # split() without an argument splits on every kind of whitespace, so
    # words separated by newlines or tabs do not end up glued in one token.
    tokens = s.split()
    # Zipping the token list against itself shifted by 0..n-1 positions
    # yields every window of n consecutive tokens.
    ngrams = zip(*[tokens[i:] for i in range(n)])
    return [" ".join(ngram) for ngram in ngrams]


# Also available under the longer name, so code that calls
# generate_ngrams(...) resolves to the same function.
generate_ngrams = ngram_gen

------------------------------------------------------------------------------------------

# code to read from a text file and generate n-grams

# Read the whole text file; the with-block closes the file even on error.
with open("hup.txt", "r") as fil:
    cou = fil.read()

# NOTE(review): the PDF sections of this file build their n-grams with n=5.
# N-grams of different sizes never compare equal, so confirm which n is
# intended and use the same value everywhere before comparing the lists.
am = ngram_gen(cou, n=3)
am

-------------------------------------------------------------------------------------------

# code to read from a PDF file and generate n-grams

# Install the third-party packages once from a shell (or with %pip in a
# notebook) before running this section; these are not Python statements:
#   pip install PyPDF2
#   pip install textract
#   pip install nltk
import PyPDF2

# PdfReader, .pages and extract_text() are the current PyPDF2 names; the
# older PdfFileReader / numPages / getPage / extractText spellings were
# removed in PyPDF2 3.0, which is what an unpinned "pip install" fetches.
with open('new.pdf', 'rb') as pdfFileObj:
    pdfReader = PyPDF2.PdfReader(pdfFileObj)
    print(len(pdfReader.pages))
    # Only the first page of the document is analysed.
    pageObj = pdfReader.pages[0]
    # Extract the text while the file is still open.
    z = pageObj.extract_text()

bm = ngram_gen(z, n=5)
bm

--------------------------------------------------------------------------------------------

# code to count plagiarism and store in array

from collections import Counter

# Tally the reference n-grams once, so each lookup below is O(1) instead of
# rescanning the whole list for every n-gram of the text file.
reference_counts = Counter(bm)
# bm1 (the n-grams of all PDFs combined) is only created by the
# "combine all files" section; include it when that section has been run.
try:
    reference_counts.update(bm1)
except NameError:
    pass

# cnt[k] is the number of times the k-th n-gram of the text file occurs in
# the reference n-grams. The list is sized to the input, not to a fixed
# number of slots, so any number of n-grams is handled.
cnt = [reference_counts[i] for i in am]
for i, c in zip(am, cnt):
    print(i, c)
print(cnt)

j = len(am)        # number of n-grams checked
a = cnt.count(0)   # n-grams that were not found in any reference
# Percentage of n-grams found at least once; 0 when there is nothing to check.
p = (j - a) / j * 100 if j else 0
if p > 25:
    print("\nplagiarism detected!!\n plagiarism level:", round(p), "%")

---------------------------------------------------------------------------------------------

# code to combine all files and count plagiarism

import fnmatch
import os

# Collect the first-page text of every PDF in the current directory.
listOfFiles = os.listdir('.')
pattern = "*.pdf"
page_texts = []
for entry in listOfFiles:
    if not fnmatch.fnmatch(entry, pattern):
        continue
    # The with-block closes each PDF as soon as its text has been extracted.
    with open(entry, 'rb') as pdfFileObj:
        # PdfReader / .pages / extract_text() are the current PyPDF2 names;
        # PdfFileReader / numPages / getPage / extractText were removed in
        # PyPDF2 3.0.
        pdfReader = PyPDF2.PdfReader(pdfFileObj)
        print(len(pdfReader.pages))
        # Only the first page of each document is used.
        pageObj = pdfReader.pages[0]
        z1 = pageObj.extract_text()
    page_texts.append(z1)

# Join with a space so the last word of one document is not fused with the
# first word of the next one.
c1 = " ".join(page_texts)
bm1 = ngram_gen(c1, n=5)
from collections import Counter

# Tally the combined PDF n-grams once, so each lookup below is O(1) instead
# of rescanning the whole list for every n-gram of the text file.
reference_counts = Counter(bm1)

# cnt[k] is the number of times the k-th n-gram of the text file occurs in
# the combined PDF n-grams. The list is sized to the input, not to a fixed
# number of slots, so any number of n-grams is handled.
cnt = [reference_counts[i] for i in am]
for i, c in zip(am, cnt):
    print(i, c)
print(cnt)

j = len(am)        # number of n-grams checked
a = cnt.count(0)   # n-grams that were not found in any PDF
# Percentage of n-grams found at least once; 0 when there is nothing to check.
p = (j - a) / j * 100 if j else 0
if p > 25:
    print("\nplagiarism detected!!\n plagiarism level:", round(p), "%")

--------------------------------------------------------------------------------------

Python Programming

Tuesday, 15 October 2019

n-gram implementation in Python using PDF file Operations

No comments:

Post a Comment