Datos:

Enlace a Drive para los datos

Lo que vamos a necesitar para correr el código:

 

import numpy as np
from optparse import OptionParser
import sys
import re
from time import time
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.linear_model import LogisticRegression as LR
from nltk.tokenize import TweetTokenizer
from sklearn import metrics
import random
# Seed NumPy's global RNG and the standard-library `random` module with fixed
# values so that randomized steps produce the same results on every run.
np.random.seed(42069)
random.seed(69420)

Funciones útiles que no me voy a detener a explicar:

 

def sort_coo(coo_matrix):
    """Return the (column, value) pairs of a COO matrix, largest value first.

    Reads the ``col`` and ``data`` attributes of ``coo_matrix`` in lockstep.
    Pairs are ordered by value in descending order; pairs with equal values
    are ordered by column index, also descending.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    # Primary key is the stored value, secondary key is the column index.
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs
def extract_topn_from_vector(feature_names, sorted_items, topn):
    """Map the first ``topn`` items to ``{feature name: rounded score}``.

    Args:
        feature_names: Indexable collection; ``feature_names[idx]`` gives the
            name for the index stored in each item.
        sorted_items: Sequence of ``(idx, score)`` pairs, already in the
            desired order (e.g. the output of ``sort_coo``).
        topn: Number of leading items to keep (applied as a slice, so a value
            larger than the sequence length keeps everything).

    Returns:
        A dict, in the same order as ``sorted_items``, whose values are the
        scores rounded to 3 decimal places. If two kept items resolve to the
        same feature name, the later score wins.
    """
    # Build the mapping in one pass instead of filling two parallel lists and
    # re-walking them by index.
    return {
        feature_names[idx]: round(score, 3)
        for idx, score in sorted_items[:topn]
    }
    

Enlace a mis repositorios:

Enlace a la libreta usada en el taller de Hackapalooza