summary refs log tree commit diff
diff options
context:
space:
mode:
author    Paweł Redman <pawel.redman@gmail.com>  2020-07-22 16:27:50 +0200
committer Paweł Redman <pawel.redman@gmail.com>  2020-07-22 16:27:50 +0200
commit    db0040dbe91729b9bd78c40b4be75068b1e6968a (patch)
tree      0a7545088209f124d9767a389cbe7f7306b66d15
parent    663122dabf0498ae9522b0f53939e21ab64d052b (diff)
Vastly improve the quality of clusterings.
Setting a minimum cluster size prevents the algorithm from trying to 'cheat' by creating many 3- or 4-colour clusters. This was especially bad when PCA would determine only two basis functions were necessary (the third one being essentially random), causing massive reconstruction errors.
-rw-r--r--  demo.py      |  2
-rw-r--r--  otsu2018.py  | 85
2 files changed, 69 insertions(+), 18 deletions(-)
diff --git a/demo.py b/demo.py
index 10e6251..dbaa77d 100644
--- a/demo.py
+++ b/demo.py
@@ -17,7 +17,7 @@ if __name__ == '__main__':
print('Clustering...')
before = clustering.root.total_reconstruction_error()
- clustering.optimise(8)
+ clustering.optimise()
after = clustering.root.total_reconstruction_error()
print('Error before: %g' % before)
diff --git a/otsu2018.py b/otsu2018.py
index e188950..88ab261 100644
--- a/otsu2018.py
+++ b/otsu2018.py
@@ -168,7 +168,7 @@ class Node:
Node._counter += 1
def __str__(self):
- return 'Node #%d' % (self.number)
+ return 'Node #%d (%d)' % (self.number, len(self.colours))
@property
def leaf(self):
@@ -283,7 +283,7 @@ class Node:
if self._cached_reconstruction_error:
return self._cached_reconstruction_error
- if not self.PCA_done: # FIXME
+ if not self.PCA_done:
self.PCA()
error = 0
@@ -333,9 +333,10 @@ class Node:
"""
partition = self.colours.partition(axis)
- if len(partition[0]) < 3 or len(partition[1]) < 3:
+ if (len(partition[0]) < self.clustering.min_cluster_size
+ or len(partition[1]) < self.clustering.min_cluster_size):
raise ClusteringError(
- 'partition created parts that are too small for PCA')
+ 'partition created parts smaller than min_cluster_size')
lesser = Node(self.clustering, partition[0])
lesser.PCA()
@@ -356,24 +357,28 @@ class Node:
print('%s: already optimised' % self)
return self.best_partition
+ error = self.reconstruction_error()
+
best_error = None
bar = tqdm.trange(2 * len(self.colours), leave=False)
for direction in [0, 1]:
for i in range(len(self.colours)):
+ bar.update()
+
origin = self.colours.xy[i, direction]
axis = PartitionAxis(origin, direction)
try:
- error, partition = self.partition_error(axis)
+ new_error, partition = self.partition_error(axis)
except ClusteringError:
continue
- if best_error is None or error < best_error:
- self.best_partition = (error, axis, partition)
+ if new_error >= error:
+ continue
- delta = error - self.reconstruction_error()
- bar.update()
+ if best_error is None or new_error < best_error:
+ self.best_partition = (new_error, axis, partition)
bar.close()
@@ -407,6 +412,29 @@ class Node:
self._plot_colours([0])
plt.legend()
+ def visualise_pca(self):
+ """
+ Makes a plot showing the principal components of this node and how
+ well they reconstruct the source data.
+ """
+
+ plt.subplot(2, 1, 1)
+ plt.title(str(self) + ': principal components')
+ for i in range(3):
+ plt.plot(self.wl, self.basis_functions[i, :], label='PC%d' % i)
+ plt.legend()
+
+ plt.subplot(2, 1, 2)
+ plt.title(str(self) + ': data')
+ for i in range(3):
+ plt.plot(self.wl, self.colours.reflectances[i, :], 'C%d:' % i)
+
+ XYZ = self.colours.XYZ[i, :]
+ recon = self.reconstruct(XYZ)
+ plt.plot(self.wl, recon.values, 'C%d-' % i)
+ recon = self.reconstruct(XYZ)
+ plt.plot(self.wl, recon.values, 'C%d--' % i)
+
class Clustering:
"""
@@ -505,7 +533,7 @@ class Clustering:
node = search(self.root)
return node.reconstruct(XYZ)
- def optimise(self, repeats):
+ def optimise(self, repeats=8, min_cluster_size=None):
"""
Optimise the tree by repeatedly performing optimal partitions of the
nodes, creating a clustering that minimizes the total reconstruction
@@ -513,18 +541,35 @@ class Clustering:
Parameters
----------
- repeats : int
+ repeats : int, optional
Maximum number of splits. If the dataset is too small, this number
- might not be reached.
+ might not be reached. The default is to create 8 clusters, like in
+ the original paper.
+ min_cluster_size : int, optional
+ Smallest acceptable cluster size. By default it's chosen
+ automatically, based on the size of the dataset and desired number
+ of clusters. Must be at least 3 or principal component analysis
+ will not be possible.
"""
+ if min_cluster_size is not None:
+ self.min_cluster_size = min_cluster_size
+ else:
+ self.min_cluster_size = len(self.root.colours) / repeats // 2
+
+ if self.min_cluster_size <= 3:
+ self.min_cluster_size = 3
+
t0 = time.time()
def elapsed():
delta = time.time() - t0
return '%dm%.3fs' % (delta // 60, delta % 60)
+ initial_error = self.root.total_reconstruction_error()
+ print('Initial error is %g.' % initial_error)
+
for repeat in range(repeats):
- print('=== Iteration %d of %d ===' % (repeat + 1, repeats))
+ print('\n=== Iteration %d of %d ===' % (repeat + 1, repeats))
best_total_error = None
total_error = self.root.total_reconstruction_error()
@@ -534,8 +579,8 @@ class Clustering:
try:
error, axis, partition = leaf.find_best_partition()
- except ClusteringError:
- print('...no partitions are possible.')
+ except ClusteringError as e:
+ print('Failed: %s' % e)
continue
new_total_error = (total_error - leaf.reconstruction_error()
@@ -548,11 +593,17 @@ class Clustering:
best_partition = partition
if best_total_error is None:
- print('WARNING: only %d splits were possible.' % repeat)
+ print('\nNo further improvements are possible.\n'
+ 'Terminating at iteration %d.\n' % repeat)
break
- print('\nSplit %s into %s and %s along %s.\n'
+ print('\nSplit %s into %s and %s along %s.'
% (best_leaf, *best_partition, best_axis))
+ print('Error is reduced by %g and is now %g, '
+ '%.1f%% of the initial error.' % (leaf.reconstruction_error()
+ - error, best_total_error, 100 * best_total_error
+ / initial_error))
+
best_leaf.split(best_partition, best_axis)
print('Finished in %s.' % elapsed())