Thursday, 19 December 2019

Parse a text file and generate tokens to a file in Python

Hi All,

Please find the program below.

------------------------------------
import re
def ngram_gen(s, n):
    s = s.lower()
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
    tokens = [token for token in s.split(" ") if token != ""]
    ngrams = zip(*[tokens[i:] for i in range(n)])
    return [" ".join(ngram) for ngram in ngrams]


fil=open("aaa.php""r")
cou=fil.read()
am= ngram_gen(cou, n=1)
am

# strip newlines and tabs from each token and join them into a single string
str1=''
for i in am:
  print(i)
  i=i.replace("\n","")
  i=i.replace("\t","")
  str1=str1+" "+i
print(str1)
f = open("hup.csv", "w")
f.write(str1)
f.close()

Tuesday, 22 October 2019

Canny Edge Detection in Python

Hi All,

See the code.
------------------------------------
import cv2
import numpy as np
from matplotlib import pyplot as plt
# read the image as grayscale and detect edges with Canny (thresholds 100 and 200)
img = cv2.imread('tkm1.jpg',0)
edges = cv2.Canny(img,100,200)
plt.subplot(121),plt.imshow(img,cmap = 'gray')
plt.title('Original Image'), plt.xticks([]), plt.yticks([])
plt.subplot(122),plt.imshow(edges,cmap = 'gray')
plt.title('Edge Image'), plt.xticks([]), plt.yticks([])
plt.show()

Tuesday, 15 October 2019

n-gram implementation in Python using PDF file Operations

Hi All,
See the code snippets...
---------------------------------------------------------------------------------------------------------------------
# code to generate ngrams

import re
def generate_ngrams(s, n):
    s = s.lower()
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
    tokens = [token for token in s.split(" ") if token != ""]
    ngrams = zip(*[tokens[i:] for i in range(n)])
    return [" ".join(ngram) for ngram in ngrams]


------------------------------------------------------------------------------------------

# code to read from a text file and generate ngrams

fil=open("hup.txt", "r")
cou=fil.read()
am= generate_ngrams(cou, n=3)
am


-------------------------------------------------------------------------------------------

# code to read from a pdf file and generate ngrams

# install the required packages first (run these in a shell, not in Python):
#   pip install PyPDF2
#   pip install textract
#   pip install nltk
import PyPDF2
pdfFileObj = open('new.pdf', 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
print(pdfReader.numPages)
pageObj = pdfReader.getPage(0)
z=pageObj.extractText()
bm= generate_ngrams(z, n=5)
bm


--------------------------------------------------------------------------------------------

# code to count plagiarism and store in array

j=0
cnt=[0]*88          # one counter per n-gram of the suspect document
for i in am:
  c=bm.count(i)
  c=c+bm1.count(i)  # bm1 is the n-gram list built from the combined PDFs (next snippet)
  print(i,c)
  cnt[j]=c
  j=j+1
print(cnt)
a=cnt[0:j].count(0) # number of n-grams that were never matched
p=((j-a)/j*100)
if p>25:
  print("\nplagiarism detected!!\n plagiarism level:",round(p),"%")


---------------------------------------------------------------------------------------------

# code to combine all files and count plagiarism

import os, fnmatch
c1=""
listOfFiles = os.listdir('.')
pattern = "*.pdf"
for entry in listOfFiles:
    if fnmatch.fnmatch(entry, pattern):
        pdfFileObj = open(entry, 'rb')
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        print(pdfReader.numPages)
        pageObj = pdfReader.getPage(0)
        z1=pageObj.extractText()
        c1=c1+z1
bm1= generate_ngrams(c1, n=5)
j=0
cnt=[0]*88          # one counter per n-gram of the suspect document
for i in am:
  c=bm1.count(i)
  print(i,c)
  cnt[j]=c
  j=j+1
print(cnt)
a=cnt[0:j].count(0) # number of n-grams that were never matched
p=((j-a)/j*100)
if p>25:
  print("\nplagiarism detected!!\n plagiarism level:",round(p),"%")
           

--------------------------------------------------------------------------------------

Tuesday, 1 October 2019

Steps in Machine Learning

Hi All,
Please find the different steps in Machine Learning below.

---------------------------------
Training data

you are great,positive
its bad,negative
ok fine,neutral
-----------------------------------------------
Step 1: Create Vocabulary

vocabulary = you are great its bad ok fine
--------------------------------------------------
Step 2: Training

1. create the feature set

you:true are:true great:true its:false bad:false ok:false fine:false,positive

you:false are:false great:false its:true bad:true ok:false fine:false,negative

you:false are:false great:false its:false bad:false ok:true fine:true,neutral

2. using the feature set, train the classifier (Naive Bayes)

------------------------------------------------
Step 3: Testing
1. read the test data from console/csv/cloud/google drive
  it was fine
2. create the feature set of the test data

 you:false are:false great:false its:false bad:false ok:false fine:true
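
Putting the three steps together, here is a minimal sketch using NLTK's NaiveBayesClassifier on the toy training data above (essentially a condensed version of the complete program in the Sentiment Analysis post):

from nltk import NaiveBayesClassifier as nbc
from nltk.tokenize import word_tokenize
from itertools import chain

# nltk.download('punkt') may be needed once for word_tokenize
training_data = [("you are great", "positive"),
                 ("its bad", "negative"),
                 ("ok fine", "neutral")]

# Step 1: vocabulary = all distinct words in the training sentences
vocabulary = set(chain(*[word_tokenize(s.lower()) for s, tag in training_data]))

# Step 2: one boolean feature per vocabulary word, then train Naive Bayes
feature_set = [({w: (w in word_tokenize(s.lower())) for w in vocabulary}, tag)
               for s, tag in training_data]
classifier = nbc.train(feature_set)

# Step 3: featurize the test sentence and classify it
test_sentence = "it was fine"
features = {w: (w in word_tokenize(test_sentence.lower())) for w in vocabulary}
print(classifier.classify(features))   # likely "neutral" on this toy data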




Monday, 30 September 2019

Remove stopwords in Python

Hi All,
See the code below, which uses a set difference operation.

--------------------------------
stopwords="is and it when which where to in have has "
testdata="it is bad and worst"
stoplist=stopwords.split()
testlist=testdata.split()
newtestlist=list(set(testlist)-set(stoplist))
print(newtestlist)
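
Note that the set difference drops duplicates and does not preserve the original word order. If order matters, here is a minimal alternative sketch using a list comprehension:

stopwords = "is and it when which where to in have has"
testdata = "it is bad and worst"
stoplist = set(stopwords.split())
# keep only the non-stopwords, preserving their original order
newtestlist = [w for w in testdata.split() if w not in stoplist]
print(newtestlist)   # ['bad', 'worst']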
----------------------------------------

Sunday, 29 September 2019

Classification in Python without Machine Learning

Hi All,
See the code...

The input is trainingdata.csv with the following content:

high-temparature headache cough,fever
chest-pain high-pressure breathing-issue,heartattack
very-high-esr faint high-beta-count,cancer

-------------------------------------
import csv
list1=[]
class1=[]
stopwords="i am you we an in on where is are what which here"
slist=stopwords.split()
csvinput=open("trainingdata.csv","r")
reader=csv.reader(csvinput,delimiter=",")
for sym,label in reader:
  list1.append(sym)
  class1.append(label)
print(list1)
print(class1)
in1=input("Enter your symptoms: ")
inlist=in1.split()
newlist=list(set(inlist)-set(slist))
print(newlist)
j=0
for i in list1:
  #print(i)
  templist=i.split()
  # symptoms common to this training row and the user's input
  commonlist=list(set(templist)&set(newlist))
  l1=len(commonlist)
  percentage=l1/len(templist)*100
  #print(templist)
  print(class1[j],percentage)
  j=j+1
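
As a possible extension (not part of the original snippet), the class with the highest overlap percentage can be reported as the final prediction:

best = -1
bestclass = ""
j = 0
for i in list1:
  templist = i.split()
  percentage = len(set(templist) & set(newlist)) / len(templist) * 100
  if percentage > best:
    best = percentage
    bestclass = class1[j]
  j = j + 1
print("Predicted class:", bestclass, "with", round(best), "% match")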
  

Read tweets

Hi All,
Use this code for reading tweets.
------------------------------------
import tweepy #https://github.com/tweepy/tweepy
import csv

#Twitter API credentials
consumer_key = ""
consumer_secret = ""
access_key = "-"
access_secret = ""


#def get_all_tweets(screen_name):
print("entered HUP")
#Twitter only allows access to a users most recent 3240 tweets with this method

#authorize twitter, initialize tweepy
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)

#initialize a list to hold all the tweepy Tweets
alltweets = []

#make initial request for most recent tweets (200 is the maximum allowed count)
new_tweets = api.user_timeline(screen_name = "sumeesh96283695",count=200)
 
#get_all_tweets("sumeesh96283695")
alltweets.extend(new_tweets)

#update the id of the oldest tweet less one
oldest = alltweets[-1].id - 1

print ("...%s tweets downloaded so far" % (len(alltweets)))

#transform the tweepy tweets into a 2D array that will populate the csv
outtweets = [[tweet.id_str, tweet.created_at, tweet.text.encode("utf-8")] for tweet in alltweets]

for i in outtweets:
  print(i)
  print("-----------")

Saturday, 13 July 2019

Complete Program

Hi All,

Please find the complete Sentiment Analysis program below.

trainingdata.csv

i am fine,neutral
its great,positive
he is good,positive
so bad,negative
you are waste,negative

testdata.csv

1,raj,i am fine
2,manu,he is great
3,Raji,it is bad

------------------------------------------------------------------------------------------------------------------------



from nltk import NaiveBayesClassifier as nbc
from nltk.tokenize import word_tokenize
from itertools import chain
import csv

with open('trainingdata.csv','r') as csvinput:
    reader=csv.reader(csvinput,delimiter=",")
    rownum = 0 
    training_data = []

    for row in reader:
        training_data.append (row)
        rownum += 1

# build the vocabulary from every word in the training sentences
vocabulary = set(chain(*[word_tokenize(i[0].lower()) for i in training_data]))

# one boolean feature per vocabulary word, paired with each sentence's tag
feature_set = [({i:(i in word_tokenize(sentence.lower())) for i in vocabulary},tag) for sentence, tag in training_data]

classifier = nbc.train(feature_set)

with open('testdata.csv','r') as csvinput:
    with open('data.csv', 'w') as csvoutput:
        writer = csv.writer(csvoutput, lineterminator='\n')
        reader1 = csv.reader(csvinput)

        all = []
        # skip the first row (treated as a header; remove this line if testdata.csv has no header row)
        row = next(reader1)
        

        for row in reader1:
            test_sentence = row[2]
            featurized_test_sentence =  {i:(i in word_tokenize(test_sentence.lower())) for i in vocabulary}
            print ("test_sent:",test_sentence)
            print ("tag:",classifier.classify(featurized_test_sentence))
            row.append(classifier.classify(featurized_test_sentence))
            all.append(row)
        writer.writerows(all)

Friday, 12 July 2019

Dear All,

Find the steps in Twitter Sentiment Analysis using Python below.

1. Import necessary packages

from nltk import NaiveBayesClassifier as nbc
from nltk.tokenize import word_tokenize
from itertools import chain
import csv


2. Read the input file using csv reader and generate a list of those tweets
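
A minimal sketch, mirroring the complete program in the previous post:

with open('trainingdata.csv','r') as csvinput:
    reader = csv.reader(csvinput, delimiter=",")
    training_data = []
    for row in reader:
        training_data.append(row)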

3. Generate a vocabulary

vocabulary = set(chain(*[word_tokenize(i[0].lower()) for i in training_data]))


4. Generate training data

feature_set = [({i:(i in word_tokenize(sentence.lower())) for i in vocabulary},tag) for sentence, tag in training_data]

5. Train the classifier

classifier = nbc.train(feature_set)

6. Generate output csv file

writer = csv.writer(csvoutput, lineterminator='\n')

7. Generate Test Input

featurized_test_sentence =  {i:(i in word_tokenize(test_sentence.lower())) for i in vocabulary}

8. Classify and create output data

row.append(classifier.classify(featurized_test_sentence))
all.append(row)

9. Flush output data to an output csv file

writer.writerows(all)