In [40]:
## Imports — deduplicated (the original repeated numpy 5x, KMeans 6x, pyplot 4x)
## and grouped: stdlib, numeric/data, plotting, sklearn, scipy, misc third-party.
## Every name bound by the original import block is still bound here.
import os
import re   ## for regular expressions (used in stemming/cleaning)

import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.cm as cm          ## same module object as `from matplotlib import cm`
import matplotlib.pyplot as plt
import seaborn as sns
from colorspacious import cspace_converter

import sklearn
from sklearn import metrics, preprocessing
from sklearn.cluster import AgglomerativeClustering, DBSCAN, KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import StandardScaler

import scipy.cluster.hierarchy as hc

from wordcloud import WordCloud, STOPWORDS
In [18]:
# Load the labeled headline data previously collected from the news API.
# index_col=0 uses the first CSV column (the LABEL) as the row index.
# NOTE(review): later cells cluster a matrix named DF_TF that is not built
# anywhere in this notebook as shown — presumably a count/TF vectorization
# of these headlines; confirm the cell creating it runs before In[19].
My_Orig_DF=pd.read_csv("Labeled_Headline_Data_from_API.csv", sep=',', index_col=0)
In [19]:
## Elbow method: run k-means for k = 2..19 and record the inertia
## (within-cluster sum of squared distances) for each k, then plot it.
SS_dist = []

values_for_k=range(2,20)

for k_val in values_for_k:
    print(k_val)
    ## random_state pinned so the recorded inertia values are reproducible
    ## (consistent with the silhouette cell, which uses random_state=10).
    k_means = KMeans(n_clusters=k_val, random_state=10)
    ## fit() sets inertia_ on the estimator itself; the (previously unused)
    ## `model` return-value alias was dropped.
    k_means.fit(DF_TF)
    SS_dist.append(k_means.inertia_)

print(SS_dist)
print(values_for_k)

plt.plot(values_for_k, SS_dist, 'bx-')
plt.xlabel('value')
plt.ylabel('Sum of squared distances')
plt.title('Elbow method for optimal k Choice')
plt.show() 
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
[430.03478260869576, 399.2045314900153, 380.29232531500566, 359.44962406015054, 339.8267068273091, 313.30691158156924, 302.4181224004754, 286.2261178861788, 271.37209784411266, 254.92000000000013, 237.8405797101449, 235.52072072072056, 222.93974358974356, 218.55036231884057, 208.72857142857154, 201.23740310077514, 190.09085213032589, 185.74920634920636]
range(2, 20)
In [20]:
#####
# KMEANS
# Use k-means clustering on the data.

# Create clusters 
k = 10
## Sklearn requires you to instantiate first.
## random_state pinned so labels/centroids are reproducible across runs.
kmeans = KMeans(n_clusters=k, random_state=10)
kmeans.fit(DF_TF)   ## run kmeans

labels = kmeans.labels_
print(labels)

centroids = kmeans.cluster_centers_
print(centroids)

## predict() on the training data returns the same assignments as labels_
## (the two printouts are identical); kept for illustration.
prediction = kmeans.predict(DF_TF)
print(prediction)
[6 6 6 4 9 3 6 6 6 6 1 6 6 6 6 1 6 6 6 1 6 6 3 3 1 6 6 6 1 6 6 3 6 6 6 6 2
 6 6 6 1 6 6 6 6 6 6 2 6 6 6 6 1 6 6 6 6 6 6 6 6 8 6 6 6 1 1 2 9 6 6 1 6 6
 6 6 6 6 6 4 6 6 6 6 2 6 6 6 0 6 6 7 0 0 6 6 6 6 5 0 8 4 6 4 6 6 6 6 6 5 0
 6 6 6 6 6 6 6 6 6]
[[ 0.00000000e+00 -6.93889390e-18  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00 -6.93889390e-18
  -1.38777878e-17  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  2.00000000e-01  0.00000000e+00
   0.00000000e+00  1.40000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  4.00000000e-01  1.00000000e+00 -2.77555756e-17
   0.00000000e+00  0.00000000e+00  0.00000000e+00  2.00000000e-01
  -6.93889390e-18  2.00000000e-01  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  4.00000000e-01 -1.38777878e-17
   0.00000000e+00  0.00000000e+00  6.00000000e-01  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00 -1.38777878e-17 -6.93889390e-18  2.00000000e-01
   0.00000000e+00  1.00000000e+00]
 [ 6.93889390e-18 -6.93889390e-18 -6.93889390e-18 -1.38777878e-17
  -6.93889390e-18  0.00000000e+00  4.00000000e-01 -6.93889390e-18
   4.00000000e-01  6.93889390e-18  1.00000000e-01 -6.93889390e-18
   1.00000000e-01 -6.93889390e-18 -6.93889390e-18  6.93889390e-18
   2.00000000e-01 -1.38777878e-17  1.00000000e-01  6.93889390e-18
  -6.93889390e-18  1.00000000e-01 -1.38777878e-17  2.77555756e-17
   2.00000000e-01 -6.93889390e-18  1.00000000e-01  0.00000000e+00
  -6.93889390e-18  1.00000000e-01  6.93889390e-18  1.00000000e-01
  -1.38777878e-17  6.93889390e-18 -6.93889390e-18  1.00000000e-01
   6.93889390e-18  1.00000000e-01 -6.93889390e-18 -6.93889390e-18
  -6.93889390e-18 -6.93889390e-18  1.38777878e-17 -6.93889390e-18
   6.93889390e-18  1.00000000e+00  1.00000000e-01 -6.93889390e-18
   3.00000000e-01  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  2.50000000e-01  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  4.50000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  2.50000000e-01
   1.00000000e+00  0.00000000e+00  1.25000000e+00  5.00000000e-01
   0.00000000e+00  0.00000000e+00  0.00000000e+00  2.50000000e-01
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  2.50000000e-01  0.00000000e+00
   0.00000000e+00  2.50000000e-01  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 7.50000000e-01  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  2.50000000e-01  0.00000000e+00
   2.00000000e+00  0.00000000e+00  0.00000000e+00  1.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   5.00000000e-01  5.00000000e-01  0.00000000e+00  5.00000000e-01
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  2.50000000e-01  0.00000000e+00  0.00000000e+00
   2.50000000e-01  2.50000000e-01  0.00000000e+00  0.00000000e+00
   0.00000000e+00  5.00000000e-01  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  7.50000000e-01]
 [ 0.00000000e+00  5.00000000e-01  0.00000000e+00  0.00000000e+00
   0.00000000e+00  2.50000000e-01  0.00000000e+00  1.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  2.50000000e-01
   2.50000000e-01  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  1.00000000e+00
   0.00000000e+00  5.00000000e-01  0.00000000e+00  2.50000000e-01
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   7.50000000e-01  0.00000000e+00  0.00000000e+00  1.00000000e+00
   0.00000000e+00  2.50000000e-01]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   3.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  1.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  1.00000000e+00]
 [ 3.48837209e-02  5.81395349e-02  5.81395349e-02  6.97674419e-02
   5.81395349e-02  1.16279070e-01  1.16279070e-02  3.48837209e-02
   2.32558140e-02  6.97674419e-02  4.65116279e-02  4.85722573e-17
   1.38777878e-17  5.81395349e-02  4.65116279e-02  6.97674419e-02
   1.16279070e-02  1.16279070e-02  8.13953488e-02  4.65116279e-02
   5.81395349e-02  3.48837209e-02  3.48837209e-02  2.32558140e-02
   3.48837209e-02  3.48837209e-02  2.32558140e-02  1.04651163e-01
   2.32558140e-02  1.16279070e-02  9.02056208e-17  1.04651163e-01
   8.13953488e-02  5.81395349e-02  3.48837209e-02  5.81395349e-02
   4.65116279e-02  3.48837209e-02  2.32558140e-02  1.16279070e-02
   5.81395349e-02  2.32558140e-02  1.27906977e-01  3.48837209e-02
   1.16279070e-02 -8.32667268e-17  4.65116279e-02  2.32558140e-02
   3.48837209e-02  2.20930233e-01]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  5.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   1.00000000e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   2.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  3.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  1.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  2.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  2.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  2.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  1.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  2.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  1.00000000e+00  1.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   1.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   1.00000000e+00  0.00000000e+00  1.00000000e+00  0.00000000e+00
   1.00000000e+00  0.00000000e+00]]
[6 6 6 4 9 3 6 6 6 6 1 6 6 6 6 1 6 6 6 1 6 6 3 3 1 6 6 6 1 6 6 3 6 6 6 6 2
 6 6 6 1 6 6 6 6 6 6 2 6 6 6 6 1 6 6 6 6 6 6 6 6 8 6 6 6 1 1 2 9 6 6 1 6 6
 6 6 6 6 6 4 6 6 6 6 2 6 6 6 0 6 6 7 0 0 6 6 6 6 5 0 8 4 6 4 6 6 6 6 6 5 0
 6 6 6 6 6 6 6 6 6]
In [21]:
## Z-score normalize the term-frequency matrix: subtract each column's mean
## and divide by its standard deviation (pandas aligns both ops on columns).
DF_TF_normalized = DF_TF.sub(DF_TF.mean()).div(DF_TF.std())
print(DF_TF_normalized)
                 activist  american  announced     asian    awards     black  \
LABEL                                                                          
black youth     -0.228458 -0.247852  -0.207644 -0.250527 -0.207644 -0.329510   
black youth     -0.228458 -0.247852  -0.207644 -0.250527 -0.207644  2.306568   
black youth     -0.228458 -0.247852  -0.207644 -0.250527 -0.207644  2.306568   
black youth     -0.228458 -0.247852  -0.207644 -0.250527 -0.207644  2.306568   
black youth     -0.228458 -0.247852  -0.207644 -0.250527 -0.207644  4.942645   
...                   ...       ...        ...       ...       ...       ...   
immigrant youth -0.228458 -0.247852  -0.207644 -0.250527 -0.207644 -0.329510   
immigrant youth -0.228458 -0.247852  -0.207644 -0.250527 -0.207644 -0.329510   
immigrant youth -0.228458 -0.247852  -0.207644  2.755796 -0.207644 -0.329510   
immigrant youth -0.228458 -0.247852  -0.207644 -0.250527 -0.207644 -0.329510   
immigrant youth -0.228458 -0.247852  -0.207644 -0.250527 -0.207644 -0.329510   

                   change      city   climate     colin  ...  thousands  \
LABEL                                                    ...              
black youth     -0.175129 -0.247852 -0.282007 -0.175129  ...  -0.174411   
black youth     -0.175129 -0.247852 -0.282007 -0.175129  ...  -0.174411   
black youth     -0.175129 -0.247852 -0.282007  6.830024  ...  -0.174411   
black youth     -0.175129  4.001042 -0.282007 -0.175129  ...  -0.174411   
black youth     -0.175129 -0.247852 -0.282007 -0.175129  ...  -0.174411   
...                   ...       ...       ...       ...  ...        ...   
immigrant youth -0.175129 -0.247852 -0.282007 -0.175129  ...  -0.174411   
immigrant youth -0.175129 -0.247852 -0.282007 -0.175129  ...  -0.174411   
immigrant youth -0.175129 -0.247852 -0.282007 -0.175129  ...   4.011455   
immigrant youth -0.175129 -0.247852 -0.282007 -0.175129  ...  -0.174411   
immigrant youth -0.175129 -0.247852 -0.282007 -0.175129  ...  -0.174411   

                 transgender     white     women   working     world  \
LABEL                                                                  
black youth        -0.138362 -0.331942 -0.207644 -0.228458 -0.316349   
black youth        -0.138362 -0.331942 -0.207644 -0.228458 -0.316349   
black youth        -0.138362  2.987474 -0.207644 -0.228458 -0.316349   
black youth        -0.138362 -0.331942 -0.207644  4.340699 -0.316349   
black youth        -0.138362 -0.331942 -0.207644  4.340699 -0.316349   
...                      ...       ...       ...       ...       ...   
immigrant youth    -0.138362 -0.331942 -0.207644 -0.228458 -0.316349   
immigrant youth    -0.138362 -0.331942 -0.207644 -0.228458 -0.316349   
immigrant youth    -0.138362 -0.331942 -0.207644 -0.228458 -0.316349   
immigrant youth    -0.138362 -0.331942 -0.207644 -0.228458 -0.316349   
immigrant youth    -0.138362 -0.331942 -0.207644 -0.228458 -0.316349   

                     year      york     young     youth  
LABEL                                                    
black youth     -0.247852 -0.247852 -0.236365  3.800502  
black youth     -0.247852 -0.247852 -0.236365 -0.563711  
black youth     -0.247852 -0.247852 -0.236365 -0.563711  
black youth     -0.247852  4.001042 -0.236365  1.618395  
black youth      4.001042 -0.247852  3.309114 -0.563711  
...                   ...       ...       ...       ...  
immigrant youth -0.247852 -0.247852 -0.236365 -0.563711  
immigrant youth -0.247852 -0.247852 -0.236365 -0.563711  
immigrant youth -0.247852 -0.247852 -0.236365 -0.563711  
immigrant youth -0.247852 -0.247852 -0.236365 -0.563711  
immigrant youth -0.247852 -0.247852 -0.236365 -0.563711  

[120 rows x 50 columns]
In [26]:
print(DF_TF_normalized.shape[0])   ## num rows
print(DF_TF_normalized.shape[1])   ## num cols
## NOTE(review): the recorded output is 50 then 120 — the reverse of the
## 120x50 frame printed by the previous cell. This cell was apparently
## re-run after the transpose below, i.e. it is not idempotent; verify
## under Restart Kernel -> Run All.

NumCols=DF_TF_normalized.shape[1]  ## (not used by any cell shown here)
# ## Instantiated my own copy of PCA
My_pca = PCA(n_components=2)  ## I want the two prin columns

## Transpose it
## NOTE(review): this rebinds DF_TF_normalized in place, so every re-run
## flips the orientation. The next cell transposes again and refits, so
## the fit below is superseded there.
DF_TF_normalized=np.transpose(DF_TF_normalized)
My_pca.fit(DF_TF_normalized)
50
120
Out[26]:
PCA(n_components=2)
In [27]:
## Transpose it
## NOTE(review): second transpose + refit; this fit replaces the one from the
## previous cell. The recorded Comps output has 120 rows, i.e. PCA was fit
## with the 120 documents as features — confirm this orientation is the one
## intended under a fresh, in-order run.
DF_TF_normalized=np.transpose(DF_TF_normalized)
My_pca.fit(DF_TF_normalized)

print(My_pca)
print(My_pca.components_.T)  ## one row per column of the fitted frame
# KnownLabels=["solar", "nuclear", "fossil", "hydro" ]

# Reformat and view results
## Columns come out named PC0 and PC1 via the '%s' format below.
Comps = pd.DataFrame(My_pca.components_.T,
                        columns=['PC%s' % _ for _ in range(2)],
                        index=DF_TF_normalized.columns
                        )
print(Comps)
print(Comps.iloc[:,0])
RowNames = list(Comps.index)
KnownLabels=RowNames  ## row labels (LABEL index values), used to annotate plots
#print(RowNames)
PCA(n_components=2)
[[ 6.95460956e-03  2.03252633e-02]
 [-3.64424277e-02 -2.27391864e-02]
 [-3.42955479e-02 -1.22161452e-01]
 [-7.71286333e-02 -1.95946718e-01]
 [-1.75272393e-01 -3.61085228e-01]
 [ 3.01115645e-01  8.00544307e-02]
 [-5.48581916e-02 -3.76085352e-02]
 [-1.43276064e-02 -2.05214956e-02]
 [ 5.55897579e-02  7.64604633e-03]
 [ 7.10409376e-03  1.17929317e-02]
 [ 1.14243782e-01  8.99058049e-03]
 [-7.24756597e-02 -5.51631850e-02]
 [-9.98734755e-03  9.35745978e-03]
 [-3.30480813e-02 -5.79156234e-02]
 [-4.34691681e-03 -2.55638253e-02]
 [ 9.23535898e-03  1.25930469e-02]
 [-3.64606451e-03  3.08026639e-02]
 [-3.56755303e-02 -3.65109934e-02]
 [ 4.56609939e-02  2.49178662e-02]
 [-6.73444029e-03 -1.56382268e-02]
 [-1.38642332e-02  1.57198636e-03]
 [-3.42955479e-02 -1.22161452e-01]
 [ 4.14323991e-01  3.02336056e-02]
 [ 3.45319561e-01  2.52173786e-02]
 [-3.26849291e-02 -3.88271708e-02]
 [-7.24756597e-02 -5.51631850e-02]
 [ 3.06972974e-03 -8.64832525e-02]
 [-3.64424277e-02 -2.27391864e-02]
 [ 9.23535898e-03  1.25930469e-02]
 [ 3.72943733e-02  1.85187861e-02]
 [ 5.55897579e-02  7.64604633e-03]
 [ 3.01115645e-01  8.00544307e-02]
 [-5.48581916e-02 -3.76085352e-02]
 [-4.91039121e-02 -1.11048326e-01]
 [-2.51616643e-02 -5.62453220e-02]
 [ 7.10409376e-03  1.17929317e-02]
 [-1.96746469e-01  2.77923245e-01]
 [-2.88460250e-02 -5.02228284e-02]
 [-6.65796202e-06  4.31513007e-03]
 [-3.56755303e-02 -3.65109934e-02]
 [ 1.14243782e-01  8.99058049e-03]
 [-5.48581916e-02 -3.76085352e-02]
 [-1.30225541e-02 -1.73343232e-02]
 [-3.24709636e-02 -6.27189230e-02]
 [-6.65796202e-06  4.31513007e-03]
 [ 1.47070020e-03  5.45303726e-03]
 [-2.59571468e-02 -1.88738139e-02]
 [-1.46646526e-01  1.94906851e-01]
 [ 4.18546808e-03 -3.47715378e-02]
 [ 2.58564937e-02  3.14500617e-02]
 [ 2.58564937e-02  3.14500617e-02]
 [-1.55993507e-02  1.32875906e-03]
 [-3.83501724e-03 -6.39488975e-02]
 [ 1.47070020e-03  5.45303726e-03]
 [-6.65796202e-06  4.31513007e-03]
 [-6.65796202e-06  4.31513007e-03]
 [-6.36974970e-03 -1.76335761e-02]
 [ 2.90442839e-02  2.86127331e-02]
 [-9.74188767e-03 -3.21329946e-02]
 [ 3.77461273e-03  1.75677878e-02]
 [-9.20862967e-03 -1.82474142e-03]
 [-7.45528033e-02  5.53584659e-02]
 [ 3.14043723e-02 -6.21497931e-03]
 [ 1.59313470e-02  1.80927552e-02]
 [ 4.56609939e-02  2.49178662e-02]
 [ 1.40814322e-01  3.29761198e-02]
 [-2.12925683e-02 -2.70249250e-02]
 [-1.98686228e-01  2.05428463e-01]
 [-1.75272393e-01 -3.61085228e-01]
 [-3.64606451e-03  3.08026639e-02]
 [ 1.93459431e-01  3.51419999e-02]
 [ 2.29211738e-01  6.12162476e-02]
 [-1.94550675e-02 -4.10694697e-02]
 [ 2.72548339e-02 -1.83446861e-03]
 [ 3.85502422e-02  1.74400646e-02]
 [-1.99520824e-02 -9.14974494e-03]
 [-6.65796202e-06  4.31513007e-03]
 [-2.18347227e-03  2.37655989e-02]
 [-2.63113594e-03 -2.46551929e-02]
 [-5.50450274e-02 -2.11090637e-01]
 [ 3.10248945e-03  1.57653262e-02]
 [ 3.85502422e-02  1.74400646e-02]
 [-6.95135578e-02  1.05747027e-01]
 [-2.75322011e-03  3.00860566e-02]
 [-1.40852067e-01  1.93275499e-01]
 [-1.11505355e-02 -2.34321954e-02]
 [-6.51654832e-02  9.77492982e-02]
 [-2.88460250e-02 -5.02228284e-02]
 [ 1.33223886e-02  3.46933240e-02]
 [ 8.18472423e-02 -3.42070324e-02]
 [ 1.02273421e-02  1.28921769e-01]
 [-3.08544347e-01  4.48948080e-01]
 [ 2.69679983e-02  5.16592053e-02]
 [ 2.69679983e-02  5.16592053e-02]
 [-1.96993261e-02 -4.23201354e-02]
 [ 2.90442839e-02  2.86127331e-02]
 [-6.65796202e-06  4.31513007e-03]
 [ 2.57886609e-02 -2.00369671e-04]
 [ 1.43560595e-02  4.98807829e-02]
 [ 6.26232134e-02  4.47660633e-02]
 [-7.45528033e-02  5.53584659e-02]
 [ 4.79680136e-02 -7.13213488e-02]
 [-1.59764572e-02 -2.39161436e-02]
 [-5.50450274e-02 -2.11090637e-01]
 [-2.03448712e-02 -1.70098538e-02]
 [-2.04152798e-02  6.29980291e-02]
 [-1.99520824e-02 -9.14974494e-03]
 [-1.30225541e-02 -1.73343232e-02]
 [-6.65796202e-06  4.31513007e-03]
 [ 1.43560595e-02  4.98807829e-02]
 [ 3.85509554e-03 -1.66776321e-02]
 [-6.65796202e-06  4.31513007e-03]
 [ 1.20760774e-03 -1.28591558e-02]
 [-2.28084735e-02 -5.37703316e-02]
 [-6.65796202e-06  4.31513007e-03]
 [-2.75613941e-02  4.02173884e-02]
 [-6.65796202e-06  4.31513007e-03]
 [-2.18060226e-02 -4.92982283e-02]
 [-7.44282081e-02  1.12136236e-01]
 [-6.65796202e-06  4.31513007e-03]]
                      PC0       PC1
LABEL                              
black youth      0.006955  0.020325
black youth     -0.036442 -0.022739
black youth     -0.034296 -0.122161
black youth     -0.077129 -0.195947
black youth     -0.175272 -0.361085
...                   ...       ...
immigrant youth -0.027561  0.040217
immigrant youth -0.000007  0.004315
immigrant youth -0.021806 -0.049298
immigrant youth -0.074428  0.112136
immigrant youth -0.000007  0.004315

[120 rows x 2 columns]
LABEL
black youth        0.006955
black youth       -0.036442
black youth       -0.034296
black youth       -0.077129
black youth       -0.175272
                     ...   
immigrant youth   -0.027561
immigrant youth   -0.000007
immigrant youth   -0.021806
immigrant youth   -0.074428
immigrant youth   -0.000007
Name: PC0, Length: 120, dtype: float64
In [28]:
## Scatter plot of the two principal components, one point per headline,
## annotated with its known label.
plt.figure(figsize=(12,12))
plt.scatter(Comps.iloc[:,0], Comps.iloc[:,1], s=100)#, color="green")

plt.xlabel("PC 1")
plt.ylabel("PC 2")
## Fixed typo in the displayed title ("Perfoming" -> "Performing").
plt.title("Scatter Plot After Performing PCA",fontsize=15)
## Tiny font because all 120 labels are drawn.
for i, label in enumerate(KnownLabels):
    plt.annotate(label, (Comps.iloc[i,0], Comps.iloc[i,1]), fontsize=5)

plt.show()
In [29]:
## KMeans with k=10 on the 2-D PCA projection, plotted one color per cluster.
## (Removed the redundant in-cell re-imports of KMeans/numpy — both are
## imported at the top of the notebook — and the no-op bare expressions
## `type(Comps)` / `Comps.shape`, which displayed nothing mid-cell.)

k = 10 ##set the amount of clusters to be made

## random_state pinned so the clustering (and therefore the plot) is reproducible.
kmeans = KMeans(n_clusters=k, random_state=10) # initialize the class object

labels = kmeans.fit_predict(Comps) # predict the centers for all points
labels_unique = np.unique(labels)

## plotting the results
colors = cm.rainbow(np.linspace(0, 1, k)) #make different colors for each cluster(here 10 colors are made)
Comps=np.array(Comps) # to numpy array: the boolean-mask indexing below needs it
centers = kmeans.cluster_centers_
for i, c in zip(labels_unique, colors):
    plt.scatter(Comps[labels == i, 0], Comps[labels == i, 1], label = i, color=c)
for i, label in enumerate(KnownLabels):
    plt.annotate(label, (Comps[i,0], Comps[i,1]), fontsize=5)
# plt.scatter(centers[:,0] , centers[:,1] , s = 80, color = 'k')
plt.legend(loc='upper right', bbox_to_anchor=(1.13, 1))
plt.title('KMeans Clustering with K=10')
plt.show()
In [30]:
 #DBSCAN
##
###############################################


MyDBSCAN = DBSCAN(eps=0.1, min_samples=2)
## eps:
    ## The maximum distance between two samples for 
    ##one to be considered as in the neighborhood of the other.
MyDBSCAN.fit_predict(DF_TF)
print(MyDBSCAN.labels_)
[-1  0  1 -1  2  3  4 -1  5  6  7  8 -1 -1 -1  9 10 11 12 -1 -1  1 -1 -1
 -1  8 -1  0  9 -1  5  3  4 -1 -1  6 -1 13 14 11  7  4 15 -1 14 16 -1 -1
 -1 17 17 -1 -1 16 14 14 -1 18 -1 -1 -1 19 -1 -1 12 -1 -1 -1  2 10 -1 -1
 -1 -1 20 21 14 -1 -1 22 -1 20 -1 -1 -1 -1 -1 13 -1 -1 -1 -1 23 23 -1 18
 14 -1 24 -1 19 -1 -1 22 -1 -1 21 15 14 24 -1 14 -1 -1 14 -1 14 -1 -1 14]
In [31]:
## Silhouette analysis over k = 2..20: for each k, draw (left) the per-sample
## silhouette plot and (right) the clustered 2-D PCA scatter.
## NOTE(review): the `Comps[:, 0]` indexing below requires Comps to be a numpy
## array — it assumes the KMeans plotting cell (which converts it) ran first.
range_n_clusters = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
avg_list=[]  ## collects the average silhouette score per k (plotted in the next cell)
print(Comps)

for n_clusters in range_n_clusters:
    # Create a subplot with 1 row and 2 columns
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # The 1st subplot is the silhouette plot
    # The silhouette coefficient can range from -1, 1 but in this example all
    # lie within [-0.1, 1]
    ax1.set_xlim([-0.1, 1])
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(Comps) + (n_clusters + 1) * 10])

    # Initialize the clusterer with n_clusters value and a random generator
    # seed of 10 for reproducibility.
    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(Comps)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed
    # clusters
    # avg_list=[]
    silhouette_avg = silhouette_score(Comps, cluster_labels)
    avg_list.append(silhouette_avg)
    print("For n_clusters =", n_clusters,
          "The average silhouette_score is :", silhouette_avg)


    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(Comps, cluster_labels)

    # Stack each cluster's sorted silhouette band vertically, with a
    # 10-unit gap between bands (tracked via y_lower / y_upper).
    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples belonging to
        # cluster i, and sort them
        ith_cluster_silhouette_values = \
            sample_silhouette_values[cluster_labels == i]

        ith_cluster_silhouette_values.sort()

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(np.arange(y_lower, y_upper),
                          0, ith_cluster_silhouette_values,
                          facecolor=color, edgecolor=color, alpha=0.7)

        # Label the silhouette plots with their cluster numbers at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        # Compute the new y_lower for next plot
        y_lower = y_upper + 10  # 10 for the 0 samples

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")

    # The vertical line for average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax1.set_yticks([])  # Clear the yaxis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

    # 2nd Plot showing the actual clusters formed
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(Comps[:, 0], Comps[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                c=colors, edgecolor='k')

    # Labeling the clusters
    centers = clusterer.cluster_centers_
    # Draw white circles at cluster centers
    ax2.scatter(centers[:, 0], centers[:, 1], marker='o',
                c="white", alpha=1, s=200, edgecolor='k')

    # Write each cluster's index inside its white center circle.
    for i, c in enumerate(centers):
        ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1,
                    s=50, edgecolor='k')

    ax2.set_title("The visualization of the clustered data.")
    ax2.set_xlabel("Feature space for the 1st feature")
    ax2.set_ylabel("Feature space for the 2nd feature")

    plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                  "with n_clusters = %d" % n_clusters),
                 fontsize=14, fontweight='bold')

plt.show()
[[ 6.95460956e-03  2.03252633e-02]
 [-3.64424277e-02 -2.27391864e-02]
 [-3.42955479e-02 -1.22161452e-01]
 [-7.71286333e-02 -1.95946718e-01]
 [-1.75272393e-01 -3.61085228e-01]
 [ 3.01115645e-01  8.00544307e-02]
 [-5.48581916e-02 -3.76085352e-02]
 [-1.43276064e-02 -2.05214956e-02]
 [ 5.55897579e-02  7.64604633e-03]
 [ 7.10409376e-03  1.17929317e-02]
 [ 1.14243782e-01  8.99058049e-03]
 [-7.24756597e-02 -5.51631850e-02]
 [-9.98734755e-03  9.35745978e-03]
 [-3.30480813e-02 -5.79156234e-02]
 [-4.34691681e-03 -2.55638253e-02]
 [ 9.23535898e-03  1.25930469e-02]
 [-3.64606451e-03  3.08026639e-02]
 [-3.56755303e-02 -3.65109934e-02]
 [ 4.56609939e-02  2.49178662e-02]
 [-6.73444029e-03 -1.56382268e-02]
 [-1.38642332e-02  1.57198636e-03]
 [-3.42955479e-02 -1.22161452e-01]
 [ 4.14323991e-01  3.02336056e-02]
 [ 3.45319561e-01  2.52173786e-02]
 [-3.26849291e-02 -3.88271708e-02]
 [-7.24756597e-02 -5.51631850e-02]
 [ 3.06972974e-03 -8.64832525e-02]
 [-3.64424277e-02 -2.27391864e-02]
 [ 9.23535898e-03  1.25930469e-02]
 [ 3.72943733e-02  1.85187861e-02]
 [ 5.55897579e-02  7.64604633e-03]
 [ 3.01115645e-01  8.00544307e-02]
 [-5.48581916e-02 -3.76085352e-02]
 [-4.91039121e-02 -1.11048326e-01]
 [-2.51616643e-02 -5.62453220e-02]
 [ 7.10409376e-03  1.17929317e-02]
 [-1.96746469e-01  2.77923245e-01]
 [-2.88460250e-02 -5.02228284e-02]
 [-6.65796202e-06  4.31513007e-03]
 [-3.56755303e-02 -3.65109934e-02]
 [ 1.14243782e-01  8.99058049e-03]
 [-5.48581916e-02 -3.76085352e-02]
 [-1.30225541e-02 -1.73343232e-02]
 [-3.24709636e-02 -6.27189230e-02]
 [-6.65796202e-06  4.31513007e-03]
 [ 1.47070020e-03  5.45303726e-03]
 [-2.59571468e-02 -1.88738139e-02]
 [-1.46646526e-01  1.94906851e-01]
 [ 4.18546808e-03 -3.47715378e-02]
 [ 2.58564937e-02  3.14500617e-02]
 [ 2.58564937e-02  3.14500617e-02]
 [-1.55993507e-02  1.32875906e-03]
 [-3.83501724e-03 -6.39488975e-02]
 [ 1.47070020e-03  5.45303726e-03]
 [-6.65796202e-06  4.31513007e-03]
 [-6.65796202e-06  4.31513007e-03]
 [-6.36974970e-03 -1.76335761e-02]
 [ 2.90442839e-02  2.86127331e-02]
 [-9.74188767e-03 -3.21329946e-02]
 [ 3.77461273e-03  1.75677878e-02]
 [-9.20862967e-03 -1.82474142e-03]
 [-7.45528033e-02  5.53584659e-02]
 [ 3.14043723e-02 -6.21497931e-03]
 [ 1.59313470e-02  1.80927552e-02]
 [ 4.56609939e-02  2.49178662e-02]
 [ 1.40814322e-01  3.29761198e-02]
 [-2.12925683e-02 -2.70249250e-02]
 [-1.98686228e-01  2.05428463e-01]
 [-1.75272393e-01 -3.61085228e-01]
 [-3.64606451e-03  3.08026639e-02]
 [ 1.93459431e-01  3.51419999e-02]
 [ 2.29211738e-01  6.12162476e-02]
 [-1.94550675e-02 -4.10694697e-02]
 [ 2.72548339e-02 -1.83446861e-03]
 [ 3.85502422e-02  1.74400646e-02]
 [-1.99520824e-02 -9.14974494e-03]
 [-6.65796202e-06  4.31513007e-03]
 [-2.18347227e-03  2.37655989e-02]
 [-2.63113594e-03 -2.46551929e-02]
 [-5.50450274e-02 -2.11090637e-01]
 [ 3.10248945e-03  1.57653262e-02]
 [ 3.85502422e-02  1.74400646e-02]
 [-6.95135578e-02  1.05747027e-01]
 [-2.75322011e-03  3.00860566e-02]
 [-1.40852067e-01  1.93275499e-01]
 [-1.11505355e-02 -2.34321954e-02]
 [-6.51654832e-02  9.77492982e-02]
 [-2.88460250e-02 -5.02228284e-02]
 [ 1.33223886e-02  3.46933240e-02]
 [ 8.18472423e-02 -3.42070324e-02]
 [ 1.02273421e-02  1.28921769e-01]
 [-3.08544347e-01  4.48948080e-01]
 [ 2.69679983e-02  5.16592053e-02]
 [ 2.69679983e-02  5.16592053e-02]
 [-1.96993261e-02 -4.23201354e-02]
 [ 2.90442839e-02  2.86127331e-02]
 [-6.65796202e-06  4.31513007e-03]
 [ 2.57886609e-02 -2.00369671e-04]
 [ 1.43560595e-02  4.98807829e-02]
 [ 6.26232134e-02  4.47660633e-02]
 [-7.45528033e-02  5.53584659e-02]
 [ 4.79680136e-02 -7.13213488e-02]
 [-1.59764572e-02 -2.39161436e-02]
 [-5.50450274e-02 -2.11090637e-01]
 [-2.03448712e-02 -1.70098538e-02]
 [-2.04152798e-02  6.29980291e-02]
 [-1.99520824e-02 -9.14974494e-03]
 [-1.30225541e-02 -1.73343232e-02]
 [-6.65796202e-06  4.31513007e-03]
 [ 1.43560595e-02  4.98807829e-02]
 [ 3.85509554e-03 -1.66776321e-02]
 [-6.65796202e-06  4.31513007e-03]
 [ 1.20760774e-03 -1.28591558e-02]
 [-2.28084735e-02 -5.37703316e-02]
 [-6.65796202e-06  4.31513007e-03]
 [-2.75613941e-02  4.02173884e-02]
 [-6.65796202e-06  4.31513007e-03]
 [-2.18060226e-02 -4.92982283e-02]
 [-7.44282081e-02  1.12136236e-01]
 [-6.65796202e-06  4.31513007e-03]]
For n_clusters = 2 The average silhouette_score is : 0.6551896826021664
For n_clusters = 3 The average silhouette_score is : 0.6875864535048817
For n_clusters = 4 The average silhouette_score is : 0.6952835693106313
For n_clusters = 5 The average silhouette_score is : 0.4243178305606833
For n_clusters = 6 The average silhouette_score is : 0.4344791442845858
For n_clusters = 7 The average silhouette_score is : 0.46828716001592274
For n_clusters = 8 The average silhouette_score is : 0.4749597899885204
For n_clusters = 9 The average silhouette_score is : 0.4900017069827367
For n_clusters = 10 The average silhouette_score is : 0.4239906233701978
For n_clusters = 11 The average silhouette_score is : 0.42955599478228507
For n_clusters = 12 The average silhouette_score is : 0.44008948039193113
For n_clusters = 13 The average silhouette_score is : 0.446532494894326
For n_clusters = 14 The average silhouette_score is : 0.4512934191116046
For n_clusters = 15 The average silhouette_score is : 0.4438946773525304
For n_clusters = 16 The average silhouette_score is : 0.4425762809224361
For n_clusters = 17 The average silhouette_score is : 0.45119872253152804
For n_clusters = 18 The average silhouette_score is : 0.4741566371659779
For n_clusters = 19 The average silhouette_score is : 0.4786529159370846
For n_clusters = 20 The average silhouette_score is : 0.4825990751793564
In [32]:
# ## SILHOUETTE
#    Summary chart: average silhouette score as a function of k,
#    from the avg_list accumulated in the previous cell.
# =============================================================================
plt.title("Silhouette Plot",fontsize=15)
plt.xlabel("Number of Clusters")
plt.ylabel("Silhouette Score")
plt.plot(range_n_clusters, avg_list, 'bx-')
plt.show()
In [38]:
## Hierarchical (Ward-linkage) clustering of the TF matrix, shown as a dendrogram.
plt.figure(figsize =(12, 8))
plt.title('Hierarchical Clustering on NEWSAPI Text Data')

## Build the linkage matrix first, then render it with the row labels as leaves.
linkage_matrix = hc.linkage(DF_TF, method ='ward')
dendro = hc.dendrogram(linkage_matrix, leaf_font_size=7, labels=DF_TF.index)
plt.show()
In [ ]: