Tuesday, 15 October 2019

n-gram implementation in Python using PDF file Operations

Hi All,
See the code below.
---------------------------------------------------------------------------------------------------------------------
# code to generate n-grams

import re
def ngram_gen(s, n):
    """Return the word n-grams of ``s`` as space-joined strings.

    The text is lower-cased, every character that is not an ASCII letter,
    a digit or whitespace is replaced by a space, and the result is split
    into word tokens.

    Args:
        s: Input text.
        n: Number of consecutive tokens per n-gram.

    Returns:
        A list of strings in order of appearance, each made of ``n`` tokens
        joined by single spaces. The list is empty when the text has fewer
        than ``n`` tokens or when ``n`` is not positive.
    """
    s = s.lower()
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
    # split() with no argument breaks on any run of whitespace, so words
    # separated by newlines or tabs become separate tokens (split(" ") would
    # leave them fused) and no empty tokens are produced.
    tokens = s.split()
    # Zipping the token list against itself shifted by 0..n-1 positions
    # yields every window of n consecutive tokens.
    ngrams = zip(*[tokens[i:] for i in range(n)])
    return [" ".join(ngram) for ngram in ngrams]


# The usage snippets in this file call the generator under this name.
generate_ngrams = ngram_gen


------------------------------------------------------------------------------------------

# code to read from a text file and generate n-grams

# Read the whole text file; the context manager closes the handle once the
# contents have been read.
with open("hup.txt", "r") as fil:
    cou = fil.read()
# Build the n-grams with ngram_gen, the generator defined earlier in this file.
# n is 5 because these n-grams are later matched by exact string equality
# against the 5-grams extracted from the PDFs; n-grams of a different length
# could never be equal to them.
am = ngram_gen(cou, n=5)
# Bare expression: shows the list when run as a notebook cell, no effect in a
# plain script.
am


-------------------------------------------------------------------------------------------

# code to read from a PDF file and generate n-grams

# Shell commands, not Python -- run them once from a terminal before using
# this section:
#   pip install PyPDF2
#   pip install textract
#   pip install nltk
import PyPDF2

# NOTE(review): PdfFileReader / numPages / getPage / extractText are the
# legacy PyPDF2 names; newer PyPDF2 releases are reported to have renamed
# them (PdfReader, len(reader.pages), reader.pages[i], extract_text) --
# confirm against the installed version.
# The PDF is kept open only while it is being read; the context manager
# closes the handle afterwards.
with open('new.pdf', 'rb') as pdfFileObj:
    pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
    print(pdfReader.numPages)
    # Only the first page (index 0) is extracted.
    pageObj = pdfReader.getPage(0)
    z = pageObj.extractText()
# 5-grams of the extracted page text, built with ngram_gen, the generator
# defined earlier in this file.
bm = ngram_gen(z, n=5)
# Bare expression: shows the list when run as a notebook cell, no effect in a
# plain script.
bm


--------------------------------------------------------------------------------------------

# code to count plagiarism and store in array

from collections import Counter

# Occurrences of each distinct n-gram in the PDF text (bm), tallied once so
# that every lookup below is a dictionary access instead of a scan of bm.
# bm1 (the combined-PDF n-grams) is deliberately not counted here: it is only
# created in the later "combine all files" section, which does its own count.
bm_counts = Counter(bm)

# cnt[k] is the number of times the k-th n-gram of am occurs in bm. The list
# grows with am, so any number of n-grams is handled.
cnt = []
for i in am:
    c = bm_counts[i]
    print(i, c)
    cnt.append(c)
print(cnt)

j = len(cnt)      # number of n-grams checked
a = cnt.count(0)  # n-grams of am that never occur in bm
# Percentage of am's n-grams found at least once; 0 when am is empty, which
# avoids dividing by zero.
p = (j - a) / j * 100 if j else 0
if p>25:
  print("\nplagiarism detected!!\n plagiarism level:",round(p),"%")


---------------------------------------------------------------------------------------------

# code to combine all files and count plagiarism

import os
import fnmatch

# Collect the first-page text of every PDF in the current directory.
# PyPDF2 is the module imported in the single-PDF section of this file.
listOfFiles = os.listdir('.')
pattern = "*.pdf"
page_texts = []
for entry in listOfFiles:
    if fnmatch.fnmatch(entry, pattern):
        # The context manager closes each PDF as soon as its text is read.
        with open(entry, 'rb') as pdfFileObj:
            pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
            print(pdfReader.numPages)
            # Only the first page (index 0) of each file is extracted.
            pageObj = pdfReader.getPage(0)
            z1 = pageObj.extractText()
        page_texts.append(z1)
# Join with a space so the last word of one file and the first word of the
# next are not fused into a single token.
c1 = " ".join(page_texts)
# 5-grams of the combined text, built with ngram_gen, the generator defined
# earlier in this file.
bm1 = ngram_gen(c1, n=5)
from collections import Counter

# Occurrences of each distinct n-gram in the combined PDF text (bm1), tallied
# once so that every lookup below is a dictionary access instead of a scan.
bm1_counts = Counter(bm1)

# cnt[k] is the number of times the k-th n-gram of am occurs in bm1. The list
# grows with am, so any number of n-grams is handled.
cnt = []
for i in am:
    c = bm1_counts[i]
    print(i, c)
    cnt.append(c)
print(cnt)

j = len(cnt)      # number of n-grams checked
a = cnt.count(0)  # n-grams of am that never occur in bm1
# Percentage of am's n-grams found at least once; 0 when am is empty, which
# avoids dividing by zero.
p = (j - a) / j * 100 if j else 0
if p>25:
  print("\nplagiarism detected!!\n plagiarism level:",round(p),"%")
           

--------------------------------------------------------------------------------------

No comments:

Post a Comment