From db0040dbe91729b9bd78c40b4be75068b1e6968a Mon Sep 17 00:00:00 2001
From: Paweł Redman
Date: Wed, 22 Jul 2020 16:27:50 +0200
Subject: Vastly improve the quality of clusterings.

Setting a minimum cluster size prevents the algorithm from trying to
'cheat' by creating many 3- or 4-colour clusters. This was especially
bad when PCA would determine only two basis functions were necessary
(the third one being essentially random), causing massive
reconstruction errors.
---
 demo.py     |  2 +-
 otsu2018.py | 85 ++++++++++++++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 69 insertions(+), 18 deletions(-)

diff --git a/demo.py b/demo.py
index 10e6251..dbaa77d 100644
--- a/demo.py
+++ b/demo.py
@@ -17,7 +17,7 @@ if __name__ == '__main__':
 
     print('Clustering...')
     before = clustering.root.total_reconstruction_error()
-    clustering.optimise(8)
+    clustering.optimise()
     after = clustering.root.total_reconstruction_error()
 
     print('Error before: %g' % before)
diff --git a/otsu2018.py b/otsu2018.py
index e188950..88ab261 100644
--- a/otsu2018.py
+++ b/otsu2018.py
@@ -168,7 +168,7 @@ class Node:
         Node._counter += 1
 
     def __str__(self):
-        return 'Node #%d' % (self.number)
+        return 'Node #%d (%d)' % (self.number, len(self.colours))
 
     @property
     def leaf(self):
@@ -283,7 +283,7 @@ class Node:
         if self._cached_reconstruction_error:
             return self._cached_reconstruction_error
 
-        if not self.PCA_done:  # FIXME
+        if not self.PCA_done:
             self.PCA()
 
         error = 0
@@ -333,9 +333,10 @@ class Node:
         """
         partition = self.colours.partition(axis)
 
-        if len(partition[0]) < 3 or len(partition[1]) < 3:
+        if (len(partition[0]) < self.clustering.min_cluster_size
+                or len(partition[1]) < self.clustering.min_cluster_size):
             raise ClusteringError(
-                'partition created parts that are too small for PCA')
+                'partition created parts smaller than min_cluster_size')
 
         lesser = Node(self.clustering, partition[0])
         lesser.PCA()
@@ -356,24 +357,28 @@ class Node:
             print('%s: already optimised' % self)
             return self.best_partition
 
+        error = self.reconstruction_error()
+
         best_error = None
         bar = tqdm.trange(2 * len(self.colours), leave=False)
 
         for direction in [0, 1]:
             for i in range(len(self.colours)):
+                bar.update()
+
                 origin = self.colours.xy[i, direction]
                 axis = PartitionAxis(origin, direction)
 
                 try:
-                    error, partition = self.partition_error(axis)
+                    new_error, partition = self.partition_error(axis)
                 except ClusteringError:
                     continue
 
-                if best_error is None or error < best_error:
-                    self.best_partition = (error, axis, partition)
+                if new_error >= error:
+                    continue
 
-                delta = error - self.reconstruction_error()
-                bar.update()
+                if best_error is None or new_error < best_error:
+                    self.best_partition = (new_error, axis, partition)
 
         bar.close()
 
@@ -407,6 +412,29 @@ class Node:
         self._plot_colours([0])
         plt.legend()
 
+    def visualise_pca(self):
+        """
+        Makes a plot showing the principal components of this node and how
+        well they reconstruct the source data.
+        """
+
+        plt.subplot(2, 1, 1)
+        plt.title(str(self) + ': principal components')
+        for i in range(3):
+            plt.plot(self.wl, self.basis_functions[i, :], label='PC%d' % i)
+        plt.legend()
+
+        plt.subplot(2, 1, 2)
+        plt.title(str(self) + ': data')
+        for i in range(3):
+            plt.plot(self.wl, self.colours.reflectances[i, :], 'C%d:' % i)
+
+            XYZ = self.colours.XYZ[i, :]
+            recon = self.reconstruct(XYZ)
+            plt.plot(self.wl, recon.values, 'C%d-' % i)
+            recon = self.reconstruct(XYZ)
+            plt.plot(self.wl, recon.values, 'C%d--' % i)
+
 
 class Clustering:
     """
@@ -505,7 +533,7 @@ class Clustering:
         node = search(self.root)
         return node.reconstruct(XYZ)
 
-    def optimise(self, repeats):
+    def optimise(self, repeats=8, min_cluster_size=None):
         """
         Optimise the tree by repeatedly performing optimal partitions of
         the nodes, creating a clustering that minimizes the total reconstruction
@@ -513,18 +541,35 @@ class Clustering:
 
         Parameters
         ----------
-        repeats : int
+        repeats : int, optional
             Maximum number of splits. If the dataset is too small, this number
-            might not be reached.
+            might not be reached. The default is to create 8 clusters, like in
+            the original paper.
+        min_cluster_size : int, optional
+            Smallest acceptable cluster size. By default it's chosen
+            automatically, based on the size of the dataset and desired number
+            of clusters. Must be at least 3 or principal component analysis
+            will not be possible.
         """
 
+        if min_cluster_size is not None:
+            self.min_cluster_size = min_cluster_size
+        else:
+            self.min_cluster_size = len(self.root.colours) / repeats // 2
+
+        if self.min_cluster_size <= 3:
+            self.min_cluster_size = 3
+
         t0 = time.time()
 
         def elapsed():
            delta = time.time() - t0
            return '%dm%.3fs' % (delta // 60, delta % 60)
 
+        initial_error = self.root.total_reconstruction_error()
+        print('Initial error is %g.' % initial_error)
+
         for repeat in range(repeats):
-            print('=== Iteration %d of %d ===' % (repeat + 1, repeats))
+            print('\n=== Iteration %d of %d ===' % (repeat + 1, repeats))
             best_total_error = None
             total_error = self.root.total_reconstruction_error()
@@ -534,8 +579,8 @@ class Clustering:
 
                 try:
                     error, axis, partition = leaf.find_best_partition()
-                except ClusteringError:
-                    print('...no partitions are possible.')
+                except ClusteringError as e:
+                    print('Failed: %s' % e)
                     continue
 
                 new_total_error = (total_error - leaf.reconstruction_error()
@@ -548,11 +593,17 @@ class Clustering:
                     best_partition = partition
 
             if best_total_error is None:
-                print('WARNING: only %d splits were possible.' % repeat)
+                print('\nNo further improvements are possible.\n'
+                      'Terminating at iteration %d.\n' % repeat)
                 break
 
-            print('\nSplit %s into %s and %s along %s.\n'
+            print('\nSplit %s into %s and %s along %s.'
                   % (best_leaf, *best_partition, best_axis))
+            print('Error is reduced by %g and is now %g, '
+                  '%.1f%% of the initial error.' % (leaf.reconstruction_error()
+                  - error, best_total_error, 100 * best_total_error
+                  / initial_error))
+
            best_leaf.split(best_partition, best_axis)
 
         print('Finished in %s.' % elapsed())
--
cgit
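Note on the new default: the sketch below is not part of the commit; it only restates the added heuristic in isolation. optimise() now derives min_cluster_size from the dataset size and the requested number of clusters (half of the average cluster size implied by repeats, never below 3, since three colours are the minimum for a three-component PCA), and any candidate partition whose smaller side falls under that limit is rejected with ClusteringError. The standalone helper names below are hypothetical, not identifiers from otsu2018.py.

# Illustrative sketch only; not code from otsu2018.py. The helper names
# are hypothetical, the logic mirrors the defaults added in this commit.

def default_min_cluster_size(dataset_size, repeats=8):
    # Half of the average cluster size implied by `repeats`, but never
    # below 3: three colours are needed to fit three basis functions.
    # (The commit computes this as `len(colours) / repeats // 2` and then
    # clamps values <= 3 up to 3; integer division is used here instead.)
    return max(dataset_size // repeats // 2, 3)


def partition_is_acceptable(size_a, size_b, min_cluster_size):
    # A candidate split is rejected (ClusteringError in the patch) when
    # either side would fall below the minimum cluster size.
    return size_a >= min_cluster_size and size_b >= min_cluster_size


if __name__ == '__main__':
    # 4000 colours split into 8 clusters: anything below 250 colours per
    # cluster is refused, so the tree cannot 'cheat' with tiny clusters.
    limit = default_min_cluster_size(4000)                     # 250
    print(limit, partition_is_acceptable(200, 3800, limit))    # 250 False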
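The failure mode named in the commit message (a third basis function that is "essentially random") and the docstring's requirement of at least three colours per cluster can be illustrated with a small NumPy sketch, again not taken from the repository: when a cluster's reflectances are essentially spanned by two functions, the third singular value is negligible, and with fewer than three colours a third component cannot exist at all. The curves and names below are invented for illustration.

# Standalone illustration (not from otsu2018.py) of why tiny clusters and
# nearly two-dimensional data make the third principal component unreliable.

import numpy as np

rng = np.random.default_rng(0)
wl = np.arange(360, 781, 10)                     # wavelengths in nm

# Reflectance-like curves built from just two smooth basis functions,
# plus a trace of noise.
basis = np.stack([np.exp(-((wl - 550.0) / 80.0) ** 2),
                  np.linspace(0.2, 0.8, wl.size)])
weights = rng.random((50, 2))
reflectances = weights @ basis + 1e-4 * rng.standard_normal((50, wl.size))

centred = reflectances - reflectances.mean(axis=0)
s = np.linalg.svd(centred, compute_uv=False)
print(s[:3])             # s[2] is orders of magnitude below s[0] and s[1]

# A 'cluster' of only two colours cannot even have three components:
print(np.linalg.matrix_rank(centred[:2]))        # at most 2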