Monday, 10 May 2021

Remove stop words and predict using Naive Bayes Classifier

 Hi all,

Use the following code to train a Naive Bayes classifier (NBC) on text from which stop words have been removed:

-------------------------------------------------------------------------------

from nltk import NaiveBayesClassifier as nbc


from nltk.tokenize import word_tokenize


from itertools import chain


import csv
from gensim.parsing.preprocessing import remove_stopwords

from nltk.tokenize import word_tokenize



# Load the training rows from 'trainingdata.csv', passing the text column
# (column 0) through remove_stopwords before the rows are collected in
# `training_data`.
# newline='' is what the csv module documentation recommends for files
# handed to csv.reader, so quoted fields with embedded newlines parse
# correctly.
with open('trainingdata.csv', 'r', newline='') as csvinput:
    reader = csv.reader(csvinput, delimiter=",")
    rownum = 0  # number of rows appended to training_data
    training_data = []

    for row in reader:
        # csv.reader yields [] for blank lines; row[0] would raise IndexError.
        if not row:
            continue

        old = row[0]
        sent = remove_stopwords(old)
        row[0] = sent  # keep the remaining columns of the row untouched

        training_data.append(row)
        rownum += 1

        # Show each sentence before and after stop-word removal.
        print('hup original ', old)
        print('hup new ', sent)
        print('----------------')



# Tokenise every (lower-cased) training sentence exactly once. The token
# sets are reused for both the vocabulary and the per-sentence features, so
# the tokenizer is no longer re-run for every vocabulary word of every
# sentence. Each training row must unpack into (sentence, tag).
_tokenized_training_data = [
    (set(word_tokenize(sentence.lower())), tag)
    for sentence, tag in training_data
]

# Every distinct token seen anywhere in the training sentences.
vocabulary = set(
    chain.from_iterable(tokens for tokens, _ in _tokenized_training_data)
)

# One (features, tag) pair per training sentence; `features` maps each
# vocabulary word to True/False depending on whether the sentence contains it.
feature_set = [
    ({word: (word in tokens) for word in vocabulary}, tag)
    for tokens, tag in _tokenized_training_data
]

classifier = nbc.train(feature_set)



# Classify every sentence in 'testdata.csv' (text in column 1) and write each
# input row, with the predicted tag appended as a new last column, to
# 'data.csv'. The first row of the input is treated as a header: it is
# skipped and not written to the output.
# The output file is opened exactly as before (no newline argument) so the
# line endings produced together with lineterminator='\n' are unchanged.
with open('testdata.csv', 'r', newline='') as csvinput, \
        open('data.csv', 'w') as csvoutput:
    writer = csv.writer(csvoutput, lineterminator='\n')
    reader1 = csv.reader(csvinput)

    tagged_rows = []

    # Skip the header row; the None default avoids StopIteration on an
    # empty input file.
    next(reader1, None)

    for row in reader1:
        # csv.reader yields [] for blank lines; row[1] would raise IndexError.
        if not row:
            continue

        test_sentence = row[1]

        # Tokenise once per sentence instead of once per vocabulary word.
        test_tokens = set(word_tokenize(test_sentence.lower()))
        featurized_test_sentence = {
            word: (word in test_tokens) for word in vocabulary
        }

        # Classify once and reuse the result for both the log line and the
        # output row.
        predicted_tag = classifier.classify(featurized_test_sentence)

        print("test_sent:", test_sentence)
        print("tag:", predicted_tag)

        row.append(predicted_tag)
        tagged_rows.append(row)

    writer.writerows(tagged_rows)

No comments:

Post a Comment