import numpy as np
import pandas as pd
from frovedis.mllib import Word2Vec
from gensim.models import word2vec
from sklearn.cluster import KMeans
from collections import defaultdict
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import csv
import sys, os, time, csv
from frovedis.exrpc.server import FrovedisServer
import warnings

/home/user01/.local/lib/python3.6/site-packages/gensim/similarities/__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package <https://pypi.org/project/python-Levenshtein/> is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.
  warnings.warn(msg)


                          # Load the saved gensim Word2Vec model and collect one vector per vocabulary word.
model = word2vec.Word2Vec.load("word2vec.finance_news_cbow500.model")
# Iterating key_to_index yields the words themselves, so neither .keys() nor an
# extra slice copy is needed; list order follows the mapping's iteration order.
vocab = list(model.wv.key_to_index)
# vectors[i] is the embedding of vocab[i]; later cells rely on this alignment.
vectors = [model.wv[word] for word in vocab]


                          # Fit Frovedis KMeans on the word vectors and report the wall-clock fit time.
from frovedis.mllib.cluster import KMeans as frovKM

FrovedisServer.initialize("mpirun -np 2 {}".format(os.environ['FROVEDIS_SERVER']))
try:
    # Only estimator construction + fit() is timed; server start-up is excluded.
    t1 = time.time()
    f_est = frovKM(n_clusters=8, init='random', algorithm='full', random_state=123, n_init=1).fit(vectors)
    t2 = time.time()
finally:
    # Always release the server, even when fit() raises.
    FrovedisServer.shut_down()
print("Frovedis KMeans train time: {:.3f} sec".format(t2-t1))
print(set(f_est.labels_))
# np.unique accepts any array-like, so no explicit np.array() wrapper is required.
u, counts = np.unique(f_est.labels_, return_counts=True)
print(u, counts)

Frovedis KMeans train time: 0.081 sec
{0, 1, 2, 3, 4, 5, 6, 7}
[0 1 2 3 4 5 6 7] [1756 2514 2775 1972 2478 2090 3905 2602]


                          # Group vocabulary words by cluster label and preview every cluster.
cluster_labels = f_est.labels_
cluster_to_words = defaultdict(list)

# cluster_labels[i] is paired with vocab[i]; zip stops at the shorter of the two.
for cluster_id, word in zip(cluster_labels, vocab):
    cluster_to_words[cluster_id].append(word)

# Walk the mapping directly instead of rebuilding the key and value lists on
# every iteration; clusters appear in first-seen order, at most 25 words each.
for cluster_id, words in cluster_to_words.items():
    print(cluster_id, words[:25])
    print("----------------------------------")

6 ['u.s.', 'market', 'china', 'fed', 'economy', 'covid-19', 'investor', 'economic', 'global', 'growth', 'time', 'business', 'add', 'cut', 'far', 'get', 'think', 'move', 'continue', 'demand', 'people', 'well', 'world', 'policy', 'inflation']
----------------------------------
0 ['year', 'trade', 'rise', 'last', 'week', 'stock', 'month', 'price', 'new', 'expect', 'report', 'point', 'data', 'gain', 'dollar', 'high', 'yield', 'end', 'day', 'increase', 'accord', 'fell', 'close', 'show', 'wednesday']
----------------------------------
3 ['company', 'share', 'firm', 'group', 'announce', 'revenue', 'airline', 'france', 'corp', 'shareholder', 'maker', 'operate', 'stake', 'unit', 'co', 'giant', 'retailer', 'dividend', 'rival', 'electric', 'boeing', 'ltd', 'motor', 'chip', 'spain']
----------------------------------
7 ['bank', 'government', 'include', 'fund', 'plan', 'work', 'debt', 'use', 'issue', 'cost', 'tax', 'loan', 'finance', 'offer', 'source', 'pay', 'purchase', 'provide', 'public', 'benefit', 'program', 'rule', 'cash', 'account', 'reduce']
----------------------------------
2 ['donald trump', 'deal', 'country', 'president', 'tell', 'u.k.', 'meet', 'state', 'e.u.', 'official', 'call', 'talk', 'news', 'comment', 'two', 'joe biden', 'administration', 'statement', 'agreement', 'agree', 'national', 'foreign', 'election', 'italy', 'decision']
----------------------------------
1 ['analyst', 'investment', 'chief', 'financial', 'economist', 'note', 'capital', 'head', 'strategist', 'equity', 'security', 'executive', 'senior', 'research', 'new york', 'manager', 'write', 'strategy', 'director', 'partner', 'management', 'goldman sachs', 'manage', 'portfolio', 'jpmorgan']
----------------------------------
5 ['tariff', 'industry', 'supply', 'american', 'home', 'goods', 'order', 'worker', 'production', 'health', 'export', 'product', 'lockdown', 'international', 'import', 'build', 'restriction', 'travel', 'canada', 'power', 'reopen', 'region', 'city', 'impose', 'producer']
----------------------------------
4 ['service', 'technology', 'value', 'tech', 'list', 'ceo', 'apple', 'invest', 'bitcoin', 'ipo', 'name', 'customer', 'cryptocurrency', 'valuation', 'online', 'amazon', 'tesla', 'launch', 'digital', 'medium', 'platform', 'hedge fund', 'facebook', 'spac', 'model']
----------------------------------


                          # Fit scikit-learn KMeans with the same arguments as the Frovedis run and time it.
from sklearn.cluster import KMeans as skKM

# The timed window covers estimator construction and fit(), nothing else.
fit_started = time.time()
sk_kmeans = skKM(n_clusters=8, init='random', algorithm='full', random_state=123, n_init=1)
s_est = sk_kmeans.fit(vectors)
elapsed = time.time() - fit_started
print("scikit-learn KMeans train time: {:.3f} sec".format(elapsed))
# Distinct cluster ids that were actually assigned.
print(set(s_est.labels_))

scikit-learn KMeans train time: 0.677 sec
{0, 1, 2, 3, 4, 5, 6, 7}


                          # Reduce the word vectors to two components with Frovedis t-SNE and time the fit.
from frovedis.mllib.manifold import TSNE as frovTSNE

FrovedisServer.initialize("mpirun -np 2 " + os.environ["FROVEDIS_SERVER"])
try:
    t1 = time.time()
    # NOTE: f_est is rebound here to the value returned by fit_transform,
    # replacing the KMeans estimator that an earlier cell stored under this name.
    f_est = frovTSNE(n_components=2, method="exact").fit_transform(vectors)
    t2 = time.time()
    print("Frovedis t-SNE train time: {:.3f} sec".format(t2-t1))
finally:
    # Release the server even when fit_transform() raises.
    FrovedisServer.shut_down()

Frovedis t-SNE train time: 18.727 sec


                          # Run scikit-learn t-SNE (Barnes-Hut method) on the same vectors and print the elapsed time.
from sklearn.manifold import TSNE as skTSNE

# The timed window covers estimator construction and fit_transform().
fit_started = time.time()
sk_tsne = skTSNE(n_components=2, method='barnes_hut')
s_est = sk_tsne.fit_transform(vectors)
elapsed = time.time() - fit_started
print("scikit-learn t-SNE train time: {:.3f} sec".format(elapsed))

scikit-learn t-SNE train time: 198.753 sec


                          # Scatter-plot the 2-D embedding held in f_est, colored by Frovedis KMeans cluster.
FrovedisServer.initialize("mpirun -np 2 " + os.environ["FROVEDIS_SERVER"])
try:
    clustered = frovKM(n_clusters=8, init='random', algorithm='full', random_state=123, n_init=1).fit_predict(vectors)
finally:
    # Release the server even when fit_predict() raises.
    FrovedisServer.shut_down()

fig, ax = plt.subplots(figsize=(30, 30))
cmap = plt.get_cmap('Dark2')

# Assumes f_est is the (n_points, 2) result of the t-SNE cell — TODO confirm.
points = np.asarray(f_est)
n_points = points.shape[0]

# Index the colormap with integers so every cluster id picks its own entry.
# The former float lookup `label / 4` exceeded 1.0 for ids 4-7, which made all
# of them fall onto the last color. The modulo wraps around if there are ever
# more clusters than colormap entries.
colors = [cmap(int(clustered[i]) % cmap.N) for i in range(n_points)]

# One vectorized scatter call instead of one call per point.
ax.scatter(points[:, 0], points[:, 1], marker='.', c=colors)

# NOTE(review): the annotation text comes from cluster_labels (bound by an
# earlier cell) while the color comes from `clustered`; confirm they agree.
for (x, y), label, color in zip(points, cluster_labels, colors):
    ax.annotate(label, xy=(x, y), color=color, fontsize=18)

# plt.plot() with no arguments draws nothing; show the finished figure instead.
plt.show()

[]


                          # Cluster the word vectors with Frovedis DBSCAN, time the fit, and preview each cluster.
from frovedis.mllib.cluster import DBSCAN as frovDB

FrovedisServer.initialize("mpirun -np 2 {}".format(os.environ['FROVEDIS_SERVER']))
try:
    t1 = time.time()
    f_est = frovDB(eps=1.5, metric="euclidean", min_samples=5, algorithm="brute").fit(vectors)
    t2 = time.time()
    print("Frovedis DBSCAN train time: {:.3f} sec".format(t2-t1))

    # Distinct labels and how many words received each one.
    u, counts = np.unique(f_est.labels_, return_counts=True)
    print("labels_ \n",u)
    print("counts_ \n",counts)

    # cluster_labels[i] is paired with vocab[i]; group the words per label.
    cluster_labels = f_est.labels_
    cluster_to_words = defaultdict(list)
    for cluster_id, word in zip(cluster_labels, vocab):
        cluster_to_words[cluster_id].append(word)

    # Iterate the mapping directly rather than rebuilding key/value lists on
    # each pass; labels print in first-seen order, at most 25 words each.
    for cluster_id, words in cluster_to_words.items():
        print(cluster_id, words[:25])
        print("----------------------------------")
finally:
    # The server stays up while labels_ is read, and is released on any outcome.
    FrovedisServer.shut_down()

Frovedis DBSCAN train time: 0.204 sec
labels_ 
 [-1  0  1  2  3  4  5  6  7]
counts_ 
 [11852     5  8195    12     6     8     8     3     3]
-1 ['u.s.', 'year', 'market', 'trade', 'china', 'company', 'bank', 'rise', 'fed', 'last', 'week', 'economy', 'stock', 'covid-19', 'investor', 'price', 'economic', 'new', 'expect', 'share', 'report', 'global', 'growth', 'point', 'data']
----------------------------------
2 ['month', 'march', 'april', 'june', 'july', 'december', 'january', 'september', 'february', 'october', 'august', 'november']
----------------------------------
0 ['wednesday', 'thursday', 'monday', 'tuesday', 'friday']
----------------------------------
1 ['get', 'think', 'well', 'need', 'look', 'start', 'many', 'result', 'way', 'begin', 'mean', 'potential', 'worry', 'suggest', 'thing', 'believe', 'happen', 'possible', 'appear', 'seem', 'come', 'reason', 'probably', 'particularly', 'something']
----------------------------------
3 ['tdk', 'komatsu', 'keyence', 'sumco', 'yuden', 'taiyo']
----------------------------------
4 ['26-week', '13-week', '4-week', '8-week', '119-day', '42-day', '154-day', '105-day']
----------------------------------
5 ['razaqzada', 'fawad', 'forex.com', 'otunuga', 'rhona', 'briesemann', 'lukman', 'oconnell']
----------------------------------
6 ['btc', 'eth', 'utc']
----------------------------------
7 ['wakatabe', 'noguchi', 'masazumi']
----------------------------------


                          # Cluster the word vectors with scikit-learn DBSCAN, time the fit, and preview each cluster.
from sklearn.cluster import DBSCAN as skDB

t1 = time.time()
s_est = skDB(eps=1.5, metric="euclidean", min_samples=5, algorithm="brute").fit(vectors)
t2 = time.time()
print("scikit-learn DBSCAN train time: {:.3f} sec".format(t2-t1))

# Distinct labels and how many words received each one.
u, counts = np.unique(s_est.labels_, return_counts=True)
print("labels_ \n",u)
print("counts_ \n",counts)

# cluster_labels[i] is paired with vocab[i]; group the words per label.
cluster_labels = s_est.labels_
cluster_to_words = defaultdict(list)
for cluster_id, word in zip(cluster_labels, vocab):
    cluster_to_words[cluster_id].append(word)

# Iterate the mapping directly rather than rebuilding key/value lists on each
# pass; labels print in first-seen order, at most 25 words each.
for cluster_id, words in cluster_to_words.items():
    print(cluster_id, words[:25])
    print("----------------------------------")

scikit-learn DBSCAN train time: 5.830 sec
labels_ 
 [-1  0  1  2  3  4  5  6  7]
counts_ 
 [11852     5  8195    12     6     8     8     3     3]
-1 ['u.s.', 'year', 'market', 'trade', 'china', 'company', 'bank', 'rise', 'fed', 'last', 'week', 'economy', 'stock', 'covid-19', 'investor', 'price', 'economic', 'new', 'expect', 'share', 'report', 'global', 'growth', 'point', 'data']
----------------------------------
2 ['month', 'march', 'april', 'june', 'july', 'december', 'january', 'september', 'february', 'october', 'august', 'november']
----------------------------------
0 ['wednesday', 'thursday', 'monday', 'tuesday', 'friday']
----------------------------------
1 ['get', 'think', 'well', 'need', 'look', 'start', 'many', 'result', 'way', 'begin', 'mean', 'potential', 'worry', 'suggest', 'thing', 'believe', 'happen', 'possible', 'appear', 'seem', 'come', 'reason', 'probably', 'particularly', 'something']
----------------------------------
3 ['tdk', 'komatsu', 'keyence', 'sumco', 'yuden', 'taiyo']
----------------------------------
4 ['26-week', '13-week', '4-week', '8-week', '119-day', '42-day', '154-day', '105-day']
----------------------------------
5 ['razaqzada', 'fawad', 'forex.com', 'otunuga', 'rhona', 'briesemann', 'lukman', 'oconnell']
----------------------------------
6 ['btc', 'eth', 'utc']
----------------------------------
7 ['wakatabe', 'noguchi', 'masazumi']
----------------------------------

Learning algorithm	Frovedis (sec)	scikit-learn (sec)	Ratio (scikit-learn / Frovedis)
t-SNE	18.73	198.75	x10.6
k-means	0.08	0.68	x8.5
DBSCAN	0.20	5.83	x29.2

Breadcrumb navigation

Frovedis Machine Learning: Unsupervised Learning – clustering and dimensionality reduction with t-SNE – reduced learning time

Unsupervised Learning: Unsupervised Feature Extraction and Clustering Algorithms¶

Dimensionality reduction with t-SNE and clustering with k-means and DBSCAN (comparison of training time between the scikit-learn and Frovedis versions)¶

Dataset used: Economic news articles vectorized by Word2vec after word segmentation¶

Dataset used: Economic news articles vectorized by Word2vec after word segmentation¶

Clustering with Frovedis k-means¶

Clustering with scikit-learn k-means¶

Dimensionality reduction using Frovedis t-SNE¶

Dimensionality reduction using scikit-learn t-SNE¶

Graphing data after dimensionality reduction using Frovedis t-SNE¶

Clustering with Frovedis DBSCAN¶

Clustering with scikit-learn DBSCAN¶