Thursday, 19 December 2019

Parse a text file and generate tokens to a file in Python

Hi All,

Please find the program below.

------------------------------------
import re
def ngram_gen(s, n):
    s = s.lower()
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
    tokens = [token for token in s.split(" ") if token != ""]
    ngrams = zip(*[tokens[i:] for i in range(n)])
    return [" ".join(ngram) for ngram in ngrams]


fil=open("aaa.php""r")
cou=fil.read()
am= ngram_gen(cou, n=1)
am

# strip newlines and tabs from each token and join them into a single string
str1=''
for i in am:
  print(i)
  i=i.replace("\n","")
  i=i.replace("\t","")
  str1=str1+" "+i
print(str1)
f = open("hup.csv", "w")
f.write(str1)
f.close()

Tuesday, 22 October 2019

Canny Edge Detection in Python

Hi All,

See the code.
------------------------------------
import cv2
import numpy as np
from matplotlib import pyplot as plt
# read the image as grayscale and detect edges with Canny (thresholds 100 and 200)
img = cv2.imread('tkm1.jpg',0)
edges = cv2.Canny(img,100,200)
plt.subplot(121),plt.imshow(img,cmap = 'gray')
plt.title('Original Image'), plt.xticks([]), plt.yticks([])
plt.subplot(122),plt.imshow(edges,cmap = 'gray')
plt.title('Edge Image'), plt.xticks([]), plt.yticks([])
plt.show()

Tuesday, 15 October 2019

n-gram implementation in Python using PDF file Operations

Hi All,
See the code snippets...
---------------------------------------------------------------------------------------------------------------------
# code to generate ngrams

import re
def generate_ngrams(s, n):
    s = s.lower()
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
    tokens = [token for token in s.split(" ") if token != ""]
    ngrams = zip(*[tokens[i:] for i in range(n)])
    return [" ".join(ngram) for ngram in ngrams]


------------------------------------------------------------------------------------------

# code to read from a text file and generate ngrams

fil=open("hup.txt", "r")
cou=fil.read()
am= generate_ngrams(cou, n=3)
am


-------------------------------------------------------------------------------------------

# code to read from a pdf file and generate ngrams

# install the required packages first (run these in a shell, not in Python):
#   pip install PyPDF2
#   pip install textract
#   pip install nltk
import PyPDF2
pdfFileObj = open('new.pdf', 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
print(pdfReader.numPages)
pageObj = pdfReader.getPage(0)
z=pageObj.extractText()
bm= generate_ngrams(z, n=5)
bm


--------------------------------------------------------------------------------------------

# code to count plagiarism and store in array

j=0
cnt=[0]*88          # one counter per n-gram of the suspect document
for i in am:
  c=bm.count(i)
  c=c+bm1.count(i)  # bm1 is the n-gram list built from the combined PDFs (next snippet)
  print(i,c)
  cnt[j]=c
  j=j+1
print(cnt)
a=cnt[0:j].count(0) # number of n-grams that were never matched
p=((j-a)/j*100)
if p>25:
  print("\nplagiarism detected!!\n plagiarism level:",round(p),"%")


---------------------------------------------------------------------------------------------

# code to combine all files and count plagiarism

import os, fnmatch
c1=""
listOfFiles = os.listdir('.')
pattern = "*.pdf"
for entry in listOfFiles:
    if fnmatch.fnmatch(entry, pattern):
        pdfFileObj = open(entry, 'rb')
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        print(pdfReader.numPages)
        pageObj = pdfReader.getPage(0)
        z1=pageObj.extractText()
        c1=c1+z1
bm1= generate_ngrams(c1, n=5)
j=0
cnt=[0]*88          # one counter per n-gram of the suspect document
for i in am:
  c=bm1.count(i)
  print(i,c)
  cnt[j]=c
  j=j+1
print(cnt)
a=cnt[0:j].count(0) # number of n-grams that were never matched
p=((j-a)/j*100)
if p>25:
  print("\nplagiarism detected!!\n plagiarism level:",round(p),"%")
           

--------------------------------------------------------------------------------------

Tuesday, 1 October 2019

Steps in Machine Learning

Hi All,
Please find the different steps in Machine Learning below.

---------------------------------
Training data

you are great,positive
its bad,negative
ok fine,neutral
-----------------------------------------------
Step 1: Create Vocabulary

vocabulary = you are great its bad ok fine
--------------------------------------------------
Step 2: Training

1. create the feature set

you:true are:true great:true its:false bad:false ok:false fine:false,positive

you:false are:false great:false its:true bad:true ok:false fine:false,negative

you:false are:false great:false its:false bad:false ok:true fine:true,neutral

2. using the feature set, train the classifier (Naive Bayes)

------------------------------------------------
Step 3: Testing
1. read the test data from console/csv/cloud/google drive
  it was fine
2. create the feature set of the test data

 you:false are:false great:false its:false bad:false ok:false fine:true
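
Putting the three steps together, here is a minimal sketch using NLTK's NaiveBayesClassifier on the toy training data above (essentially a condensed version of the complete program in the Sentiment Analysis post):

from nltk import NaiveBayesClassifier as nbc
from nltk.tokenize import word_tokenize
from itertools import chain

# nltk.download('punkt') may be needed once for word_tokenize
training_data = [("you are great", "positive"),
                 ("its bad", "negative"),
                 ("ok fine", "neutral")]

# Step 1: vocabulary = all distinct words in the training sentences
vocabulary = set(chain(*[word_tokenize(s.lower()) for s, tag in training_data]))

# Step 2: one boolean feature per vocabulary word, then train Naive Bayes
feature_set = [({w: (w in word_tokenize(s.lower())) for w in vocabulary}, tag)
               for s, tag in training_data]
classifier = nbc.train(feature_set)

# Step 3: featurize the test sentence and classify it
test_sentence = "it was fine"
features = {w: (w in word_tokenize(test_sentence.lower())) for w in vocabulary}
print(classifier.classify(features))   # likely "neutral" on this toy data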




Monday, 30 September 2019

Remove stopwords in Python

Hi All,
See the code below, which uses a set difference operation.

--------------------------------
stopwords="is and it when which where to in have has "
testdata="it is bad and worst"
stoplist=stopwords.split()
testlist=testdata.split()
newtestlist=list(set(testlist)-set(stoplist))
print(newtestlist)
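
Note that the set difference drops duplicates and does not preserve the original word order. If order matters, here is a minimal alternative sketch using a list comprehension:

stopwords = "is and it when which where to in have has"
testdata = "it is bad and worst"
stoplist = set(stopwords.split())
# keep only the non-stopwords, preserving their original order
newtestlist = [w for w in testdata.split() if w not in stoplist]
print(newtestlist)   # ['bad', 'worst']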
----------------------------------------

Sunday, 29 September 2019

Classification in Python without Machine Learning

Hi All,
See the code...

The input is trainingdata.csv with the following content:

high-temparature headache cough,fever
chest-pain high-pressure breathing-issue,heartattack
very-high-esr faint high-beta-count,cancer

-------------------------------------
import csv
list1=[]
class1=[]
stopwords="i am you we an in on where is are what which here"
slist=stopwords.split()
csvinput=open("trainingdata.csv","r")
reader=csv.reader(csvinput,delimiter=",")
for sym,label in reader:
  list1.append(sym)
  class1.append(label)
print(list1)
print(class1)
in1=input("Enter your symptoms: ")
inlist=in1.split()
newlist=list(set(inlist)-set(slist))
print(newlist)
j=0
for i in list1:
  #print(i)
  templist=i.split()
  # symptoms common to this training row and the user's input
  commonlist=list(set(templist)&set(newlist))
  l1=len(commonlist)
  percentage=l1/len(templist)*100
  #print(templist)
  print(class1[j],percentage)
  j=j+1
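
As a possible extension (not part of the original snippet), the class with the highest overlap percentage can be reported as the final prediction:

best = -1
bestclass = ""
j = 0
for i in list1:
  templist = i.split()
  percentage = len(set(templist) & set(newlist)) / len(templist) * 100
  if percentage > best:
    best = percentage
    bestclass = class1[j]
  j = j + 1
print("Predicted class:", bestclass, "with", round(best), "% match")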
  

Read tweets

Hi All,
Use this code for reading tweets.
------------------------------------
import tweepy #https://github.com/tweepy/tweepy
import csv

#Twitter API credentials
consumer_key = ""
consumer_secret = ""
access_key = "-"
access_secret = ""


#def get_all_tweets(screen_name):
print("entered HUP")
#Twitter only allows access to a users most recent 3240 tweets with this method

#authorize twitter, initialize tweepy
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)

#initialize a list to hold all the tweepy Tweets
alltweets = []

#make initial request for most recent tweets (200 is the maximum allowed count)
new_tweets = api.user_timeline(screen_name = "sumeesh96283695",count=200)
 
#get_all_tweets("sumeesh96283695")
alltweets.extend(new_tweets)

#update the id of the oldest tweet less one
oldest = alltweets[-1].id - 1

print ("...%s tweets downloaded so far" % (len(alltweets)))

#transform the tweepy tweets into a 2D array that will populate the csv
outtweets = [[tweet.id_str, tweet.created_at, tweet.text.encode("utf-8")] for tweet in alltweets]

for i in outtweets:
  print(i)
  print("-----------")

Saturday, 13 July 2019

Complete Program

Hi All,

Please find the complete Sentiment Analysis program below.

trainingdata.csv

i am fine,neutral
its great,positive
he is good,positive
so bad,negative
you are waste,negative

testdata.csv

1,raj,i am fine
2,manu,he is great
3,Raji,it is bad

------------------------------------------------------------------------------------------------------------------------



from nltk import NaiveBayesClassifier as nbc
from nltk.tokenize import word_tokenize
from itertools import chain
import csv

with open('trainingdata.csv','r') as csvinput:
    reader=csv.reader(csvinput,delimiter=",")
    rownum = 0 
    training_data = []

    for row in reader:
        training_data.append (row)
        rownum += 1

# build the vocabulary from every word in the training sentences
vocabulary = set(chain(*[word_tokenize(i[0].lower()) for i in training_data]))

# one boolean feature per vocabulary word, paired with each sentence's tag
feature_set = [({i:(i in word_tokenize(sentence.lower())) for i in vocabulary},tag) for sentence, tag in training_data]

classifier = nbc.train(feature_set)

with open('testdata.csv','r') as csvinput:
    with open('data.csv', 'w') as csvoutput:
        writer = csv.writer(csvoutput, lineterminator='\n')
        reader1 = csv.reader(csvinput)

        all = []
        # skip the first row (treated as a header; remove this line if testdata.csv has no header row)
        row = next(reader1)
        

        for row in reader1:
            test_sentence = row[2]
            featurized_test_sentence =  {i:(i in word_tokenize(test_sentence.lower())) for i in vocabulary}
            print ("test_sent:",test_sentence)
            print ("tag:",classifier.classify(featurized_test_sentence))
            row.append(classifier.classify(featurized_test_sentence))
            all.append(row)
        writer.writerows(all)

Friday, 12 July 2019

Dear All,

Find the steps in Twitter Sentiment Analysis using Python below.

1. Import necessary packages

from nltk import NaiveBayesClassifier as nbc
from nltk.tokenize import word_tokenize
from itertools import chain
import csv


2. Read the input file using csv reader and generate a list of those tweets
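
A minimal sketch, mirroring the complete program in the previous post:

with open('trainingdata.csv','r') as csvinput:
    reader = csv.reader(csvinput, delimiter=",")
    training_data = []
    for row in reader:
        training_data.append(row)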

3. Generate a vocabulary

vocabulary = set(chain(*[word_tokenize(i[0].lower()) for i in training_data]))


4. Generate training data

feature_set = [({i:(i in word_tokenize(sentence.lower())) for i in vocabulary},tag) for sentence, tag in training_data]

5. Train the classifier

classifier = nbc.train(feature_set)

6. Generate output csv file

writer = csv.writer(csvoutput, lineterminator='\n')

7. Generate Test Input

featurized_test_sentence =  {i:(i in word_tokenize(test_sentence.lower())) for i in vocabulary}

8. Classify and create output data

row.append(classifier.classify(featurized_test_sentence))
all.append(row)

9. Flush output data to an output csv file

writer.writerows(all)