Hierarchical Clustering Example

Hierarchical Clustering Example

Visualize hierarchical clusters

import os.path import pandas as pd from time import time import requests from graphviz import Digraph dataset_name = "20_newsgroups_3categories" # see list of available datasets BASE_URL = "http://localhost:5001/api/v0" # FreeDiscovery server URL

0. Load the example dataset

url = BASE_URL + '/example-dataset/{}'.format(dataset_name) print(" GET", url) input_ds = requests.get(url).json() # To use a custom dataset, simply specify the following variables data_dir = input_ds['metadata']['data_dir'] dataset_definition = [{'document_id': row['document_id'], 'file_path': os.path.join(data_dir, row['file_path'])} for row in input_ds['dataset']]

Out:

GET http://localhost:5001/api/v0/example-dataset/20_newsgroups_3categories

# 1. Feature extraction (non hashed)

1.a Load dataset and initalize feature extraction

url = BASE_URL + '/feature-extraction' print(" POST", url) fe_opts = {'max_df': 0.6, # filter out (too)/(un)frequent words 'weighting': "ntc", } res = requests.post(url, json=fe_opts).json() dsid = res['id'] print(" => received {}".format(list(res.keys()))) print(" => dsid = {}".format(dsid))

Out:

POST http://localhost:5001/api/v0/feature-extraction
   => received ['id']
   => dsid = f07a8971acf64cab

1.b Run feature extraction

url = BASE_URL+'/feature-extraction/{}'.format(dsid) print(" POST", url) res = requests.post(url, json={'dataset_definition': dataset_definition})

Out:

POST http://localhost:5001/api/v0/feature-extraction/f07a8971acf64cab

2. Calculate LSI

url = BASE_URL + '/lsi/' print("POST", url) n_components = 100 res = requests.post(url, json={'n_components': n_components, 'parent_id': dsid }).json() lsi_id = res['id'] print(' => LSI model id = {}'.format(lsi_id)) print((' => SVD decomposition with {} dimensions ' 'explaining {:.2f} % variabilty of the data') .format(n_components, res['explained_variance']*100))

Out:

POST http://localhost:5001/api/v0/lsi/
  => LSI model id = 84c051086cdd466f
  => SVD decomposition with 100 dimensions explaining 22.26 % variabilty of the data

3. Document Clustering (LSI + Birch Clustering)

3.a. Document clustering (LSI + Birch clustering)

url = BASE_URL + '/clustering/birch/' print(" POST", url) t0 = time() res = requests.post(url, json={'parent_id': lsi_id, 'n_clusters': -1, 'min_similarity': 0.55, #'max_tree_depth': 3, }).json() mid = res['id'] print(" => model id = {}".format(mid)) print("\n4.b. Computing cluster labels") url = BASE_URL + '/clustering/birch/{}'.format(mid) print(" GET", url) res = requests.get(url, json={'n_top_words': 3 }).json() t1 = time() print(' .. computed in {:.1f}s'.format(t1 - t0)) data = res['data'] print(pd.DataFrame(data))

Out:

POST http://localhost:5001/api/v0/clustering/birch/
     => model id = b19c0bf42ea646c3

4.b. Computing cluster labels
 GET http://localhost:5001/api/v0/clustering/birch/b19c0bf42ea646c3
    .. computed in 0.9s
                                             children  cluster_depth  cluster_id                  cluster_label  cluster_similarity  cluster_size                                          documents
0   [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...              0           0                edu thanks know               0.108          1967  [{'document_id': 225, 'similarity': 0.11136850...
1                                                  []              1           1             comp image article               0.288           143  [{'document_id': 225, 'similarity': 0.21640175...
2                                                  []              1           2          format phillies files               0.337            93  [{'document_id': 121, 'similarity': 0.37184011...
3                                                  []              1           3  computer information siggraph               0.397           106  [{'document_id': 100, 'similarity': 0.00270706...
4                                                  []              1           4                  ftp data site               0.498            45  [{'document_id': 196, 'similarity': 0.63168856...
5                                                  []              1           5       points algorithm problem               0.253           208  [{'document_id': 144, 'similarity': 0.31973796...
6                                                  []              1           6         screen windows display               0.325           118  [{'document_id': 5329, 'similarity': 0.1170009...
7                                                  []              1           7           new uniforms stadium               0.300           124  [{'document_id': 961, 'similarity': 0.46294582...
8                                                  []              1           8                   bit jays edu               0.300           108  [{'document_id': 3969, 'similarity': 0.4519637...
9                                                  []              1           9           amiga aspects format               0.492            38  [{'document_id': 441, 'similarity': 0.58264766...
10                                                 []              1          10          tiff software library               0.307           136  [{'document_id': 6889, 'similarity': 0.1279159...
11                                                 []              1          11                game year lopez               0.361            93  [{'document_id': 169, 'similarity': 0.44509086...
12                                                 []              1          12              list mailing ball               0.294           129  [{'document_id': 17689, 'similarity': 0.097539...
13                                                 []              1          13         radiosity anybody help               0.337            77  [{'document_id': 361, 'similarity': 0.13379389...
14                                                 []              1          14             braves card driver               0.440            56  [{'document_id': 40401, 'similarity': 0.441856...
15                                                 []              1          15       jewish mattingly kingman               0.353            66  [{'document_id': 16129, 'similarity': 0.327495...
16                                                 []              1          16                 sox thanks red               0.392            52  [{'document_id': 15876, 'similarity': 0.630474...
17                                                 []              1          17                 mode games vga               0.228           189  [{'document_id': 0, 'similarity': 0.2483949485...
18                                                 []              1          18             version jpeg ariel               0.352            45  [{'document_id': 53824, 'similarity': 0.463246...
19                                                 []              1          19               does know thanks               0.278           141  [{'document_id': 2401, 'similarity': 0.3866424...

3.b Hierarchical cluster visualization

ch = Digraph('cluster_hierarchy', node_attr={'shape': 'record'}, format='png') ch.graph_attr['rankdir'] = 'LR' ch.graph_attr['dpi'] = "200" for row in data: ch.node('cluster_{}'.format(row['cluster_id']), '{{{}| {{ id={:03} | N={} | sim={:.2f} }}}}' .format(row['cluster_label'], row['cluster_id'], row['cluster_size'], row['cluster_similarity'])) def create_hc_links(node, ch, data): for child_id in node['children']: ch.edge('cluster_{}:f2'.format(node['cluster_id']), 'cluster_{}:f0'.format(child_id)) create_hc_links(data[child_id], ch, data) create_hc_links(data[0], ch, data) tmp_dir = os.path.join('..', '..', 'doc', 'engine', 'examples') if os.path.exists(tmp_dir): ch.render('cluster_hierarchy', directory=tmp_dir, cleanup=True) else: ch.view()

4. Delete the extracted features

url = BASE_URL + '/feature-extraction/{}'.format(dsid) requests.delete(url)

Total running time of the script: ( 0 minutes 4.112 seconds)