Compute the threshold value for clustering using the cosine distance matrix

119 views Asked by At

My idea is to calculate the average distance between the current embedding and the embeddings in every cluster. The cluster with the largest similarity is considered to be the nearest cluster (nearest_cluster_idx), and its intra-cluster distance is nearest_cluster_avg_distance. I need help choosing the right threshold value.

Many Thanks :)

    def cluster_sentences(self, base_threshold=0.25, threshold_factor=0.25):
        """Greedily group sentence indices using an adaptive distance threshold.

        Sentences are visited in the order of ``self.embeddings``. Each one is
        compared against every existing cluster via the average distance
        (``1 - similarity``) to that cluster's members, read from
        ``self.cosine_similarity_standard``. It joins the nearest cluster when
        that distance is below
        ``base_threshold + threshold_factor * intra_cluster_distance``, where
        ``intra_cluster_distance`` is the mean pairwise distance between the
        members already in the nearest cluster; otherwise it starts a new
        cluster.

        Args:
            base_threshold: Maximum average distance accepted for joining a
                cluster that has no internal spread (e.g. a single member).
                Must be positive for any merge to happen, since the distances
                compared against it are ``1 - similarity`` (non-negative when
                similarities do not exceed 1).
            threshold_factor: How much the threshold is relaxed per unit of
                the nearest cluster's own mean pairwise distance.

        Returns:
            A list of clusters, each a list of indices into
            ``self.embeddings``.
        """
        # NOTE(review): assumes cosine_similarity_standard is a square NumPy
        # array indexed [i, j] in the same order as self.embeddings.
        similarity = self.cosine_similarity_standard

        clusters = []
        for i, _ in enumerate(self.embeddings):
            min_distance = float('inf')
            nearest_cluster_idx = None

            for cluster_idx, cluster in enumerate(clusters):
                members = np.asarray(cluster)
                distance = 1 - similarity[i, members].mean()
                if distance < min_distance:
                    min_distance = distance
                    nearest_cluster_idx = cluster_idx

            # No cluster yet (first sentence), or no comparable distance.
            if nearest_cluster_idx is None:
                clusters.append([i])
                continue

            # Spread of the nearest cluster, computed once per sentence.
            members = np.asarray(clusters[nearest_cluster_idx])
            member_count = members.size
            if member_count < 2:
                # A singleton has no pairs, hence no internal spread.
                nearest_cluster_avg_distance = 0.0
            else:
                # np.ix_ selects the full members x members sub-matrix;
                # indexing with [members, members] would only read the
                # diagonal (each sentence's similarity with itself).
                pairwise = similarity[np.ix_(members, members)]
                off_diagonal_sum = pairwise.sum() - np.trace(pairwise)
                pair_count = member_count * (member_count - 1)
                nearest_cluster_avg_distance = 1 - off_diagonal_sum / pair_count

            adaptive_threshold = (
                base_threshold + threshold_factor * nearest_cluster_avg_distance
            )

            if min_distance < adaptive_threshold:
                clusters[nearest_cluster_idx].append(i)
            else:
                clusters.append([i])
        return clusters
0

There are 0 answers