In [None]:
import numpy as np
import pandas as pd
import sklearn.metrics as sm
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from string import punctuation

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [None]:
# Cap on the number of training rows and the seed used to draw them.
SAMPLE_SIZE = 10000
RANDOM_SEED = 42

# Labelled data, one JSON object per line.
d = pd.read_json(r'dataset-hackaton.json', lines=True)
# Down-sample to limit memory/CPU use; remove if you have enough memory/cpu.
# min() keeps sample() from raising when the file holds fewer rows than the
# cap, and the fixed random_state makes a fresh "Run All" draw the same rows.
d = d.sample(n=min(SAMPLE_SIZE, len(d)), random_state=RANDOM_SEED)
d

In [None]:
# Histogram of the `dir` column drawn on an explicit 15x4 inch figure/axes pair.
fig, ax = plt.subplots(figsize=(15, 4))
d['dir'].hist(ax=ax)

In [None]:
# Fetch the NLTK stop-word corpus only when it is not installed yet, so that
# re-running the notebook does not contact the download server every time.
try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download("stopwords")

In [None]:
# Snowball stemmer for Russian, used by tokenize() for every kept token.
stemmer = SnowballStemmer("russian")
# Kept as a frozenset rather than the list NLTK returns: tokenize() only tests
# membership, once per token, and a hash lookup avoids scanning the whole list
# each time.
russian_stopwords = frozenset(stopwords.words("russian"))

def tokenize(text):
    """Turn ``text`` into a list of lower-cased, stemmed tokens.

    Double quotes and parentheses are deleted, the remainder is split on
    whitespace, and a token is dropped when it is a stop word or when it occurs
    inside ``string.punctuation`` (a substring test, so a lone ``-`` or ``!``
    is removed). Every surviving token is passed through the module-level
    ``stemmer``.
    """
    cleaned = text.lower()
    for unwanted in ('"', '(', ')'):
        cleaned = cleaned.replace(unwanted, '')

    stems = []
    for word in cleaned.split():
        if word in russian_stopwords or word in punctuation:
            continue
        stems.append(stemmer.stem(word))
    return stems

In [None]:
# Quick look at what the tokenizer produces for one sample string.
sample_title = 'Боковая стенка ПРАКТИК MS 200 х 30 [S24199558302'
tokenize(sample_title)

In [None]:
# TF-IDF vectorizer that delegates tokenisation to tokenize().
# token_pattern is unused once a custom tokenizer is supplied; setting it to
# None states that explicitly and silences scikit-learn's
# "token_pattern will not be used" warning on fit.
tfidf = TfidfVectorizer(tokenizer=tokenize, token_pattern=None)

In [None]:
# Build one text document per `dir` value: every title of that `dir`,
# each preceded by a single space, in row order.
# Titles are collected in lists and joined once, instead of growing a string
# with += for every row, and the comprehension variable no longer reuses the
# name of the DataFrame `d`.
titles_by_dir = {dir_name: [] for dir_name in d['dir'].unique()}
for dir_name, title in zip(d['dir'], d['title']):
    titles_by_dir[dir_name].append(" " + title)
categ = {dir_name: "".join(parts) for dir_name, parts in titles_by_dir.items()}

In [None]:
# Parallel lists in the dict's insertion order: position i of categ_names
# labels position i of categ_values.
categ_names = [*categ]
categ_values = [*categ.values()]

In [None]:
# Fit the vectorizer on the per-category documents; tfs has one row per
# document and one column per vocabulary term.
category_docs = categ.values()
tfs = tfidf.fit_transform(category_docs)
n_rows, n_cols = tfs.shape
(n_rows, n_cols)

In [None]:
# Vocabulary terms indexed by column number of the TF-IDF matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(tfidf, "get_feature_names_out"):
    fn = tfidf.get_feature_names_out()
else:
    fn = tfidf.get_feature_names()

In [None]:
# Vectorise a single sample string and list every term that received a
# non-zero weight together with that weight.
r = tfidf.transform(['Диван раскладной угловой Каприз'])
_, nonzero_cols = r.nonzero()
for col in nonzero_cols:
    print(fn[col], r[0, col])

In [None]:
# Index the per-category TF-IDF rows for 1-nearest-neighbour lookup under
# cosine distance.
nn = NearestNeighbors(n_neighbors=1, metric='cosine')
# NearestNeighbors is unsupervised: fit() ignores any `y`, so only the matrix
# is passed. Row positions are mapped back to names through categ_names.
nn.fit(tfs)

In [None]:
# Nearest indexed row for the sample vector computed above; kneighbors()
# returns one row of neighbour indices per query row, so take the first.
neighbor_rows = nn.kneighbors(r[0], return_distance=False)
n = neighbor_rows[0]
n

In [None]:
def classify(t):
    """Predict a category name for every text in ``t``.

    The texts are vectorised with the fitted module-level ``tfidf``; each
    resulting row is matched to its nearest row indexed by ``nn`` and the
    position of that row is looked up in ``categ_names``.

    Args:
        t: iterable of strings accepted by ``tfidf.transform``.

    Returns:
        A list with one entry of ``categ_names`` per input text, in input
        order.
    """
    tr = tfidf.transform(t)
    if tr.shape[0] == 0:
        # Nothing to look up; skip the neighbour query altogether.
        return []
    # One batched query instead of one kneighbors() call per row. Column 0
    # holds the closest neighbour of each query row.
    nearest = nn.kneighbors(tr, return_distance=False)[:, 0]
    return [categ_names[i] for i in nearest]

In [None]:
# Hold-out set to predict on, stored as one JSON object per line.
HOLDOUT_PATH = r'hideout-public.json'
test = pd.read_json(HOLDOUT_PATH, lines=True)

In [None]:
# Predicted category for every hold-out title, as a one-column frame.
predicted_dirs = classify(test['title'])
a = pd.DataFrame({'dir': predicted_dirs})
a

In [None]:
# Write the predictions as JSON lines: one {"dir": ...} record per row.
SUBMISSION_PATH = 'submission.json'
a.to_json(SUBMISSION_PATH, orient='records', lines=True)

In [None]:
# Macro-averaged F1 of the classifier.
# The previous call scored a.dir against itself, which is 1.0 by construction
# and says nothing about the model.
if 'dir' in test.columns:
    # The hold-out file carries labels: score the submission directly.
    y_true, y_pred = test['dir'], a['dir']
else:
    # No labels in the hold-out: fall back to the labelled sample `d`.
    # NOTE: `d` is also the data the category vectors were built from, so this
    # is an in-sample (optimistic) estimate.
    y_true, y_pred = d['dir'], classify(d['title'])
sm.f1_score(y_true, y_pred, average='macro')