codekarim is just a bunch of reminders for fenuapps.com
# -*- coding: utf-8 -*-
"""
Naive Bayes Classifier for Multinomial Models

Loads Stack Overflow posts from 'stack-overflow-data.csv', cleans the post
text, trains a CountVectorizer -> TfidfTransformer -> MultinomialNB pipeline
on a 70/30 train/test split, and prints the accuracy and the per-tag
classification report.

@author: K
"""
import logging
import os
import re
import sys

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from IPython import get_ipython
from nltk.corpus import stopwords
from numpy import random
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfTransformer,
    TfidfVectorizer,
)
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
#import gensim

# get_ipython() returns None when the script is run by a plain Python
# interpreter, so only enable inline plotting inside an IPython session.
_ipython = get_ipython()
if _ipython is not None:
    _ipython.run_line_magic('matplotlib', 'inline')

# Reminder: how to check whether gensim is installed.
# */site-packages is where your current session is running its python out of
#
# site_path = ''
# for path in sys.path:
#     if 'site-packages' in path.split('/')[-1]:
#         print(path)
#         site_path = path
# # search to see if gensim in installed packages
# if len(site_path) > 0:
#     if not 'gensim' in os.listdir(site_path):
#         print('package not found')
#     else:
#         print('gensim installed')

df = pd.read_csv('stack-overflow-data.csv')
# Rows without a tag cannot be used as labelled examples.
df = df[pd.notnull(df['tags'])]
#print(df.head(10))
#print(df['post'].apply(lambda x: len(x.split(' '))).sum())

my_tags = [
    'java', 'html', 'asp.net', 'c#', 'ruby-on-rails', 'jquery', 'mysql',
    'php', 'ios', 'javascript', 'python', 'c', 'css', 'android', 'iphone',
    'sql', 'objective-c', 'c++', 'angularjs', '.net',
]
#plt.figure(figsize=(10,4))
#df.tags.value_counts().plot(kind='bar');


def print_plot(index):
    """Print the post and tag of the row of ``df`` whose index is ``index``.

    Prints nothing when no row has that index.
    """
    rows = df.loc[df.index == index, ['post', 'tags']].values
    # Check for an empty selection *before* indexing; rows[0] on an empty
    # selection would raise IndexError.
    if len(rows) == 0:
        return
    post, tag = rows[0]
    print(post)
    print('Tag:', tag)

#print_plot(10)


# Raw strings keep the backslash escapes for the regex engine instead of
# having Python interpret them as (invalid) string escapes.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))


def clean_text(text):
    """Normalize one post for vectorization.

    text: a string
    return: the text with HTML stripped, lowercased, punctuation listed in
        REPLACE_BY_SPACE_RE turned into spaces, characters matching
        BAD_SYMBOLS_RE removed, and English stopwords dropped.
    """
    text = BeautifulSoup(text, "lxml").text  # HTML decoding
    text = text.lower()  # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text)  # separators -> space
    text = BAD_SYMBOLS_RE.sub('', text)  # drop every other symbol
    # delete stopwords from text
    return ' '.join(word for word in text.split() if word not in STOPWORDS)


print("Start of cleantext")
df['post'] = df['post'].apply(clean_text)
#print_plot(10)
#print(df['post'].apply(lambda x: len(x.split(' '))).sum())

X = df.post
y = df.tags
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

nb = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
nb.fit(X_train, y_train)

y_pred = nb.predict(X_test)

print('accuracy %s' % accuracy_score(y_test, y_pred))
# Pass labels= together with target_names=: without it the report rows follow
# the sorted label order while my_tags is unsorted, so every row would be
# printed under the wrong tag name.
print(classification_report(y_test, y_pred, labels=my_tags,
                            target_names=my_tags))