Skip to content

saber.cluster

calc_silhouette(workdir, x, n_clusters='all', samples=75000)

Calculate the silhouette score for the given number of clusters

Parameters:

Name Type Description Default
workdir str

path to the project directory

required
x np.ndarray

a numpy array of the prepared FDC data

required
n_clusters

the number of clusters to calculate the silhouette score for

'all'
samples int

the number of samples to use for the silhouette score calculation

75000

Returns:

Type Description
None

None

Source code in saber/cluster.py
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
def calc_silhouette(workdir: str, x: np.ndarray, n_clusters: int or Iterable = 'all', samples: int = 75_000) -> None:
    """
    Calculate the silhouette score for the given number of clusters

    Args:
        workdir: path to the project directory
        x: a numpy array of the prepared FDC data
        n_clusters: the number of clusters to calculate the silhouette score for
        samples: the number of samples to use for the silhouette score calculation

    Returns:
        None
    """
    if x is None:
        x = read_table(workdir, 'hindcast_fdc_trans').values
    fdc_df = pd.DataFrame(x)

    summary = {'number': [], 'silhouette': []}

    random_shuffler = np.random.default_rng()

    for model_file in _find_model_files(workdir, n_clusters):
        logger.info(f'Calculating Silhouettes for {os.path.basename(model_file)}')
        kmeans = joblib.load(model_file)

        # randomly sample fdcs from each cluster
        fdc_df['label'] = kmeans.labels_
        ss_df = pd.DataFrame(columns=fdc_df.columns.to_list())
        for i in range(int(kmeans.n_clusters)):
            values = fdc_df[fdc_df['label'] == i].drop(columns='label').values
            random_shuffler.shuffle(values)
            values = values[:int(samples)]
            tmp = pd.DataFrame(values)
            tmp['label'] = i
            ss_df = pd.concat([ss_df, tmp])

        # calculate their silhouette scores
        ss_df['silhouette'] = silhouette_samples(ss_df.drop(columns='label').values, ss_df['label'].values, n_jobs=-1)
        ss_df['silhouette'] = ss_df['silhouette'].round(3)
        ss_df.columns = ss_df.columns.astype(str)
        write_table(ss_df, workdir, f'cluster_sscores_{kmeans.n_clusters}')

        # save the summary stats from this model
        summary['number'].append(kmeans.n_clusters)
        summary['silhouette'].append(ss_df['silhouette'].mean())

    # save the summary stats
    write_table(pd.DataFrame(summary), workdir, 'cluster_sscores')
    return

cluster(workdir, x=None, max_clusters=13)

Trains scikit-learn MiniBatchKMeans models and saves as pickle

Parameters:

Name Type Description Default
workdir str

path to the project directory

required
x np.ndarray

a numpy array of the prepared FDC data

None
max_clusters int

maximum number of clusters to train

13

Returns:

Type Description
None

None

Source code in saber/cluster.py
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
def cluster(workdir: str, x: np.ndarray = None, max_clusters: int = 13) -> None:
    """
    Trains scikit-learn MiniBatchKMeans models and saves as pickle

    Args:
        workdir: path to the project directory
        x: a numpy array of the prepared FDC data
        max_clusters: maximum number of clusters to train

    Returns:
        None
    """
    if x is None:
        x = read_table(workdir, 'hindcast_fdc_trans').values

    # build the kmeans model for a range of cluster numbers
    for n_clusters in range(2, max_clusters + 1):
        logger.info(f'Clustering n={n_clusters}')
        kmeans = MiniBatchKMeans(n_clusters=n_clusters, init='k-means++', n_init=100)
        kmeans.fit_predict(x)
        joblib.dump(kmeans, os.path.join(workdir, 'clusters', f'kmeans-{n_clusters}.pickle'))
    return

plot_centers(workdir, plt_width=2, plt_height=2, max_cols=3)

Plot the cluster centers for each cluster.

Parameters:

Name Type Description Default
workdir str

path to the project directory

required
plt_width int

width of each subplot in inches

2
plt_height int

height of each subplot in inches

2
max_cols int

maximum number of columns of subplots in the figure

3

Returns:

Type Description
None

None

Source code in saber/cluster.py
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
def plot_centers(workdir: str, plt_width: int = 2, plt_height: int = 2, max_cols: int = 3) -> None:
    """
    Plot the cluster centers for each cluster.

    Args:
        workdir: path to the project directory
        plt_width: width of each subplot in inches
        plt_height: height of each subplot in inches
        max_cols: maximum number of columns of subplots in the figure

    Returns:
        None
    """
    logger.info('Plotting Cluster Centers')

    clusters_dir = os.path.join(workdir, 'clusters')

    # count number of files to plot
    centers_files = natsorted(glob.glob(os.path.join(clusters_dir, 'cluster_centers_*.parquet')))
    n_files = len(centers_files)
    n_cols = min(n_files, max_cols)
    n_rows = math.ceil(n_files / n_cols)

    # initialize the figure and labels
    fig, axes = plt.subplots(
        n_rows,
        n_cols,
        figsize=(plt_width * n_cols + 1.25, plt_height * n_rows + 1.25),
        dpi=750,
        squeeze=False,
        tight_layout=True,
        sharey='row',
        sharex='col'
    )

    fig.suptitle('Cluster Centers', fontsize=16)
    fig.supylabel('Discharge Z-Score')
    fig.supxlabel('Exceedance Probability (%)')

    for centers_table, ax in zip(centers_files, fig.axes[:n_files]):
        n_clusters = int(centers_table.split('_')[-1].split('.')[0])
        centers_df = pd.read_parquet(centers_table, engine='fastparquet')

        for i in range(int(n_clusters)):
            ax.plot(centers_df[f'{i}'].values, label=f'Cluster {i + 1}')

        # Plot titles and labels
        ax.set_title(f"k={n_clusters} clusters")
        ax.set_xlim(0, 40)
        ax.set_ylim(-1, 5)

    fig.savefig(os.path.join(clusters_dir, f'figure-cluster-centers.png'))
    plt.close(fig)
    return

plot_clusters(workdir, x=None, n_clusters='all', max_cols=3, plt_width=2, plt_height=2, n_lines=2500)

Generate figures of the clustered FDC's

Parameters:

Name Type Description Default
workdir str

path to the project directory

required
x np.ndarray

a numpy array of the prepared FDC data

None
n_clusters

number of clusters to create figures for

'all'
max_cols int

maximum number of columns (subplots) in the figure

3
plt_width int

width of each subplot in inches

2
plt_height int

height of each subplot in inches

2
n_lines int

max number of lines to plot in each subplot

2500

Returns:

Type Description
None

None

Source code in saber/cluster.py
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
def plot_clusters(workdir: str, x: np.ndarray = None, n_clusters: int or Iterable = 'all',
                  max_cols: int = 3, plt_width: int = 2, plt_height: int = 2, n_lines: int = 2_500) -> None:
    """
    Generate figures of the clustered FDC's

    Args:
        workdir: path to the project directory
        x: a numpy array of the prepared FDC data
        n_clusters: number of clusters to create figures for
        max_cols: maximum number of columns (subplots) in the figure
        plt_width: width of each subplot in inches
        plt_height: height of each subplot in inches
        n_lines: max number of lines to plot in each subplot

    Returns:
        None
    """
    if x is None:
        x = read_table(workdir, 'hindcast_fdc_trans').values

    size = x.shape[1]
    x_values = np.linspace(0, size, 5)
    x_ticks = np.linspace(0, 100, 5).astype(int)

    random_shuffler = np.random.default_rng()

    for model_file in _find_model_files(workdir, n_clusters):
        logger.info(f'Plotting Clusters {os.path.basename(model_file)}')

        # load the model and calculate
        kmeans = joblib.load(model_file)
        n_clusters = int(kmeans.n_clusters)
        n_cols = min(n_clusters, max_cols)
        n_rows = math.ceil(n_clusters / n_cols)

        # initialize the figure and labels
        fig, axs = plt.subplots(
            n_rows,
            n_cols,
            figsize=(plt_width * n_cols + 1, plt_height * n_rows + 1),
            dpi=750,
            squeeze=False,
            tight_layout=True,
            sharey='row'
        )
        fig.suptitle("KMeans FDC Clustering")
        fig.supxlabel('Exceedance Probability (%)')
        fig.supylabel('Discharge Z-Score')

        for i, ax in enumerate(fig.axes[:n_clusters]):
            ax.set_title(f'Cluster {i + 1} (n = {np.sum(kmeans.labels_ == i)})')
            ax.set_xlim(0, size)
            ax.set_xticks(x_values, x_ticks)
            ax.set_ylim(-2, 4)
            fdc_sample = x[kmeans.labels_ == i]
            random_shuffler.shuffle(fdc_sample)
            fdc_sample = fdc_sample[:n_lines]
            for j in fdc_sample:
                ax.plot(j.ravel(), "k-")
            ax.plot(kmeans.cluster_centers_[i].flatten(), "r-")
        # turn off plotting axes which are blank (when ax number > n_clusters)
        for ax in fig.axes[n_clusters:]:
            ax.axis('off')

        fig.savefig(os.path.join(workdir, 'clusters', f'figure-clusters-{n_clusters}.png'))
        plt.close(fig)
    return

plot_fit_metrics(workdir, plt_width=5, plt_height=3)

Plot the cluster metrics, inertia and silhouette score, vs number of clusters

Parameters:

Name Type Description Default
workdir str

path to the project directory

required
plt_width int

width of each subplot in inches

5
plt_height int

height of each subplot in inches

3

Returns:

Type Description
None

None

Source code in saber/cluster.py
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
def plot_fit_metrics(workdir: str, plt_width: int = 5, plt_height: int = 3) -> None:
    """
    Plot the cluster metrics, inertia and silhouette score, vs number of clusters

    Args:
        workdir: path to the project directory
        plt_width: width of each subplot in inches
        plt_height: height of each subplot in inches

    Returns:
        None
    """
    logger.info('Plotting Cluster Fit Metrics')

    clusters_dir = os.path.join(workdir, 'clusters')

    df = read_table(workdir, 'cluster_metrics')
    if os.path.exists(get_table_path(workdir, 'cluster_sscores')):
        df = df.merge(read_table(workdir, 'cluster_sscores'), on='number', how='outer')
    df['number'] = df['number'].astype(int)

    # initialize the figure and labels
    fig, ax1 = plt.subplots(
        figsize=(plt_width, plt_height),
        dpi=750,
        tight_layout=True,
    )

    # Plot titles and labels
    ax1.set_title("Clustering Fit Metrics")
    ax1.set_xlabel("Number of Clusters")
    ax1.set_ylabel("Inertia")

    ticks = np.arange(1, df['number'].max() + 2)
    ax1.set_xlim(ticks[0], ticks[-1])
    ax1.set_xticks(ticks)

    # plot the inertia
    knee = int(df['knee'].values[0])
    ax1.plot(df['number'], df['inertia'], marker='o', label='Inertia')
    ax1.plot(knee, df[df['number'] == knee]['inertia'], marker='o', c='red', label='Knee')

    # add the silhouette scores if they were calculated
    if 'silhouette' in df.columns:
        df.loc[df['silhouette'].isna(), 'silhouette'] = ''
        ax2 = ax1.twinx()
        ax2.set_ylabel("Silhouette Score")
        ax2.set_ylim(0, 1)
        ax2.plot(df['number'], df['silhouette'], marker='o', c='green', label='Silhouette Score')

    fig.savefig(os.path.join(clusters_dir, f'figure-fit-metrics.png'))
    plt.close(fig)
    return

plot_silhouettes(workdir, plt_width=3, plt_height=3)

Plot the silhouette scores for each cluster. Based on https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html

Parameters:

Name Type Description Default
workdir str

path to the project directory

required
plt_width int

width of each subplot in inches

3
plt_height int

height of each subplot in inches

3

Returns:

Type Description
None

None

Source code in saber/cluster.py
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
def plot_silhouettes(workdir: str, plt_width: int = 3, plt_height: int = 3) -> None:
    """
    Plot the silhouette scores for each cluster.
    Based on https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html

    Args:
        workdir: path to the project directory
        plt_width: width of each subplot in inches
        plt_height: height of each subplot in inches

    Returns:
        None
    """
    logger.info('Generating Silhouette Diagrams')

    clusters_dir = os.path.join(workdir, 'clusters')

    for sscore_table in natsorted(glob.glob(os.path.join(clusters_dir, 'cluster_sscores_*.parquet'))):
        logger.info(f'Generating Silhouette Diagram: {os.path.basename(sscore_table)}')
        n_clusters = int(sscore_table.split('_')[-1].split('.')[0])
        sscore_df = pd.read_parquet(sscore_table, engine='fastparquet')
        centers_df = read_table(workdir, f'cluster_centers_{n_clusters}')
        mean_ss = sscore_df['silhouette'].mean()

        # initialize the figure
        fig, (ax1, ax2) = plt.subplots(
            nrows=1,
            ncols=2,
            figsize=(plt_width * 2 + 1, plt_height + 1),
            dpi=600,
            tight_layout=True,
        )

        # Plot 1 titles and labels
        ax1.set_title(f"Silhouette Plot (mean={mean_ss:.3f})")
        ax1.set_xlabel("Silhouette Score")
        ax1.set_ylabel("Cluster Label")
        ax1.set_yticks([])  # Clear the yaxis labels / ticks
        ax1.set_xticks([-0.2, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # Plot 2 titles and labels
        ax2.set_title("Cluster Centers")
        ax2.set_xlabel("Exceedance Probability (%)")
        ax2.set_ylabel("Discharge Z-Score")

        # The vertical line for average silhouette score of all the values
        ax1.axvline(x=mean_ss, color="red", linestyle="--")

        y_lower = 10
        for sub_cluster in range(int(n_clusters)):
            # select the rows applicable to the current sub cluster
            cluster_sscores = sscore_df[sscore_df['label'] == sub_cluster]['silhouette'].values.flatten()
            cluster_sscores.sort()

            n = cluster_sscores.shape[0]
            y_upper = y_lower + n

            color = cm.nipy_spectral(sub_cluster / int(n_clusters))
            ax1.fill_betweenx(
                np.arange(y_lower, y_upper),
                0,
                cluster_sscores,
                facecolor=color,
                edgecolor=color,
                alpha=0.7,
            )
            # Label the silhouette plots with their cluster numbers at the middle
            ax1.text(-0.05, y_lower + 0.5 * n, f'{sub_cluster + 1}: n={n:,}')

            # plot the cluster center
            ax2.plot(centers_df[f'{sub_cluster}'].values, alpha=0.7, c=color, label=f'Cluster {sub_cluster + 1}')

            # add some buffer before the next cluster
            y_lower = y_upper + 10

        fig.savefig(os.path.join(clusters_dir, f'figure-silhouette-diagram-{n_clusters}.png'))
        plt.close(fig)
    return

predict_labels(workdir, n_clusters, x)

Predict the cluster labels for a set number of FDCs

Parameters:

Name Type Description Default
workdir str

path to the project directory

required
n_clusters int

number of cluster model to use for prediction

required
x pd.DataFrame

A dataframe with 1 row per FDC (stream) and 1 column per FDC value. Index is the stream's ID.

required

Returns:

Type Description
pd.DataFrame

None

Source code in saber/cluster.py
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
def predict_labels(workdir: str, n_clusters: int, x: pd.DataFrame) -> pd.DataFrame:
    """
    Predict the cluster labels for a set number of FDCs

    Args:
        workdir: path to the project directory
        n_clusters: number of cluster model to use for prediction
        x: A dataframe with 1 row per FDC (stream) and 1 column per FDC value. Index is the stream's ID.

    Returns:
        None
    """
    model = joblib.load(os.path.join(workdir, 'clusters', f'kmeans-{n_clusters}.pickle'))
    labels_df = pd.DataFrame(
        np.transpose([model.predict(x.values), x.index]),
        columns=[clbl_col, mid_col]
    )
    write_table(labels_df, workdir, 'cluster_labels')
    return labels_df

summarize_fit(workdir)

Generate a summary of the clustering results save the centers and labels to parquet

Parameters:

Name Type Description Default
workdir str

path to the project directory

required

Returns:

Type Description
None

None

Source code in saber/cluster.py
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
def summarize_fit(workdir: str) -> None:
    """
    Generate a summary of the clustering results save the centers and labels to parquet

    Args:
        workdir: path to the project directory

    Returns:
        None
    """
    summary = {'number': [], 'inertia': [], 'n_iter': []}
    labels = []

    for model_file in _find_model_files(workdir, n_clusters='all'):
        logger.info(f'Post Processing {os.path.basename(model_file)}')
        kmeans = joblib.load(model_file)
        n_clusters = int(kmeans.n_clusters)
        labels.append(kmeans.labels_.flatten())

        # save cluster centroids to table - columns are the cluster number, rows are the centroid FDC values
        write_table(
            pd.DataFrame(np.transpose(kmeans.cluster_centers_), columns=np.array(range(n_clusters)).astype(str)),
            workdir,
            f'cluster_centers_{n_clusters}'
        )

        # save the summary stats from this model
        summary['number'].append(n_clusters)
        summary['inertia'].append(kmeans.inertia_)
        summary['n_iter'].append(kmeans.n_iter_)

    # save the summary results as a csv
    sum_df = pd.DataFrame(summary)
    sum_df['knee'] = KneeLocator(summary['number'], summary['inertia'], curve='convex', direction='decreasing').knee
    write_table(sum_df, workdir, 'cluster_metrics')

    return