From e049784a37ec52078483feb13c9154be1afb4c40 Mon Sep 17 00:00:00 2001
From: Zizhe Wang <zizhe.wang@tu-dresden.de>
Date: Wed, 12 Jun 2024 11:47:24 +0200
Subject: [PATCH] Fix adaptive instance selection and apply it per generation

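The selection loop could previously hang: with NaN objective values,
np.percentile returns NaN, no sample ever satisfies the cutoff, and
the `while True` loop never exits. GaussianMixture clustering was
also non-deterministic across runs.

adaptive_instance_selection.py:
- cluster with KMeans (fixed random_state, n_init=10) instead of
  GaussianMixture
- ignore NaN objective values via np.nanmean / np.nanpercentile
- bound the threshold loop with max_iterations and a desired_samples
  target, stop once all samples are selected, and fall back to the
  best-performing samples when the iteration budget is exhausted
- add generate_new_samples() to top a reduced population back up to
  pop_size with fresh initial samples

optimize_main.py:
- import OBJECTIVE_NAMES and derive maximize_indices from the boolean
  MAXIMIZE list
- advance the optimizer one generation at a time; every
  ADAPTIVE_INSTANCE_SELECTION_FREQUENCY generations replace half of
  the population with adaptively selected samples, then shrink
  pop_size by 10% down to MIN_POP_SIZE
- handle early termination (res may be None) and save the
  back-negated objectives (results_to_save) instead of raw res.F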
---
 src/adaptive_instance_selection.py |  78 +++++++++++++-------
 src/optimize_main.py               | 112 ++++++++++++++++++++++-------
 2 files changed, 138 insertions(+), 52 deletions(-)

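Reviewer notes (below the scissors, ignored by git am):

The new selection entry point is called from optimize_main.py as

    adaptive_samples = adaptive_select_informative_instances(
        current_samples, current_results,
        initial_threshold=0.05, adapt_rate=0.01,
        desired_samples=pop_size // 2)

The generational loop drives pymoo through repeated
minimize(problem, algorithm, ("n_gen", 1)) calls so the selection step
can edit the population in between. Be aware that minimize() copies the
algorithm by default (copy_algorithm=True), so state injected via
res.pop may not carry over between calls. Below is a sketch of the same
control flow on pymoo's ask-and-tell interface, which does keep state
across generations (assumes pymoo >= 0.6; NSGA2 stands in for
initialize_algorithm, and pop_size, n_gen, adaptive_frequency come from
OPTIMIZATION_CONFIG as in the patch):

    from pymoo.algorithms.moo.nsga2 import NSGA2

    algorithm = NSGA2(pop_size=pop_size)
    algorithm.setup(problem, termination=("n_gen", n_gen), verbose=True)
    gen = 0
    while algorithm.has_next():
        pop = algorithm.ask()                   # infill solutions for one generation
        algorithm.evaluator.eval(problem, pop)  # evaluate them on the problem
        algorithm.tell(infills=pop)             # hand results back, keeping state
        gen += 1
        if gen % adaptive_frequency == 0:
            X, F = algorithm.pop.get("X"), algorithm.pop.get("F")
            selected = adaptive_select_informative_instances(
                X, F, initial_threshold=0.05, adapt_rate=0.01,
                desired_samples=len(X) // 2)
            # ...recombine with algorithm samples and re-inject,
            # as run_optimization does in the patch
    res = algorithm.result()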
diff --git a/src/adaptive_instance_selection.py b/src/adaptive_instance_selection.py
index ebbe4f5..8f07c84 100644
--- a/src/adaptive_instance_selection.py
+++ b/src/adaptive_instance_selection.py
@@ -7,9 +7,9 @@
 #                                  #
 ####################################
 
-from sklearn.mixture import GaussianMixture
-from scipy.stats.qmc import LatinHypercube as lhs
 import numpy as np
+from sklearn.cluster import KMeans
+from scipy.stats.qmc import LatinHypercube as lhs
 from config import PARAM_TYPES, PARAM_BOUNDS
 
 # Initial Sampling
@@ -41,38 +41,59 @@ def evaluate_samples(samples, objective_function):
 def advanced_clustering_samples(samples, n_clusters):
     if len(samples) == 0:
         raise ValueError("Cannot cluster an empty set of samples.")
-    gmm = GaussianMixture(n_components=n_clusters, covariance_type='full')
-    gmm.fit(samples)
-    labels = gmm.predict(samples)
-    centers = gmm.means_
+    kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_init=10)
+    kmeans.fit(samples)
+    labels = kmeans.predict(samples)
+    centers = kmeans.cluster_centers_
     
     return labels, centers
 
 # Adaptive Selection with Adaptive Threshold
-def adaptive_select_informative_instances(samples, results, initial_threshold=0.15, adapt_rate=0.05):
+def adaptive_select_informative_instances(samples, results, initial_threshold=0.05, adapt_rate=0.01, desired_samples=None, max_iterations=100):
     if len(samples) == 0 or len(results) == 0:
         raise ValueError("Received empty samples or results for selection.")
     
-    performance = np.mean(results, axis=1)
+    performance = np.nanmean(results, axis=1)  # Use np.nanmean to ignore nan values
     threshold = initial_threshold
-    while True:
-        # Ensure the threshold does not exceed 1
+    iteration = 0
+
+    while iteration < max_iterations:
+        iteration += 1
+        print(f"Iteration {iteration}: Current threshold: {threshold}")
+
+        # Cap the threshold at 1.0
         effective_threshold = min(threshold, 1.0)
-        cutoff = np.percentile(performance, effective_threshold * 100)
+        cutoff = np.nanpercentile(performance, effective_threshold * 100)  # Use np.nanpercentile to ignore nan values
         selected_samples = samples[performance <= cutoff]
         
-        if len(selected_samples) >= 3:
+        print(f"Iteration {iteration}: Number of selected samples: {len(selected_samples)}")
+
+        if desired_samples is not None and len(selected_samples) >= desired_samples:
+            print(f"Iteration {iteration}: Desired number of samples reached.")
+            break
+        if len(selected_samples) == len(samples):
+            print(f"Iteration {iteration}: All samples selected.")
             break
         threshold += adapt_rate
 
+    if iteration == max_iterations:
+        print(f"Final threshold after max iterations: {threshold}")
+        print(f"Performance values: {performance}")
+        print(f"Number of selected samples: {len(selected_samples)}")
+        if desired_samples is not None and len(selected_samples) < desired_samples:
+            print("Falling back to the best available samples.")
+            # Select the top desired_samples samples based on performance
+            best_indices = np.argsort(performance)[:desired_samples]
+            selected_samples = samples[best_indices]
+
     if selected_samples.size == 0:
         raise ValueError("Selection of informative instances resulted in an empty set.")
-    
-    print(f"Selected samples shape: {selected_samples.shape}")
-    return selected_samples
+
+    print(f"Final selected samples shape: {selected_samples.shape}")
+    return selected_samples[:desired_samples]  # Trim to desired_samples; slicing with None keeps all samples
 
 # Iterative Refinement
-def iterative_refinement(samples, results, objective_function, maximize_indices, n_iterations=5, initial_threshold=0.15, adapt_rate=0.05):
+def iterative_refinement(samples, results, objective_function, maximize_indices, n_iterations=2, initial_threshold=0.10, adapt_rate=0.03):
     for iteration in range(n_iterations):
         print(f"Iteration {iteration}: Starting with samples shape: {samples.shape}")
 
@@ -88,29 +109,36 @@ def iterative_refinement(samples, results, objective_function, maximize_indices,
         # Ensure objective negation is correctly handled
         for i in range(len(current_results)):
             for idx in maximize_indices:
-                current_results[i][idx] = -current_results[i][idx]
+                if not np.isnan(current_results[i][idx]):
+                    current_results[i][idx] = -current_results[i][idx]
 
         # Ensure at least a minimum number of samples are selected to maintain diversity
         if len(selected_samples) < 3:
-            selected_samples = samples[np.argsort(np.mean(current_results, axis=1))[:3]]
+            selected_samples = samples[np.argsort(np.nanmean(current_results, axis=1))[:3]]
         
         # Re-cluster the selected samples
-        n_clusters = max(1, min(3, int(len(selected_samples) * 0.3)))  # Ensure at least 1 cluster, maximum 3 clusters
+        n_clusters = max(1, min(2, int(len(selected_samples) * 0.2)))  # Ensure at least 1 cluster, maximum 2 clusters
         labels, centers = advanced_clustering_samples(selected_samples, n_clusters)
         
         # Generate new samples around cluster centers
         new_samples = []
         for center in centers:
-            for _ in range(1):  # Generate 1 new sample per center to control the growth of sample size
-                perturbations = np.random.uniform(-0.05, 0.05, center.shape)  # Use smaller perturbations for finer adjustments
+            for _ in range(max(1, (len(samples) - len(selected_samples)))):  # Generate candidates per center; the surplus is trimmed below
+                perturbations = np.random.uniform(-0.03, 0.03, center.shape)  # Use smaller perturbations for finer adjustments
                 new_samples.append(center + perturbations)
         
         # Combine selected samples with new samples, ensuring we don't grow the sample size too much
-        samples = np.vstack((selected_samples, new_samples))
-        if len(samples) > len(selected_samples) + 2:  # Limit the growth of samples
-            samples = samples[:len(selected_samples) + 2]
+        combined_samples = np.vstack((selected_samples, new_samples))
+        samples = combined_samples[:len(samples)]  # Keep the original sample size (len(samples) is evaluated before reassignment)
         
         # Debugging output
         print(f"Iteration {iteration}: Samples shape after selection and new sample generation: {samples.shape}")
     
-    return samples
\ No newline at end of file
+    return samples
+
+def generate_new_samples(existing_samples, pop_size, n_adaptive_samples):
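+    # Top the adaptively selected set back up to pop_size with fresh initial samples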
+    n_new_samples = pop_size - n_adaptive_samples
+    new_samples = initial_sampling(PARAM_BOUNDS, n_new_samples)
+    combined_samples = np.vstack((existing_samples, new_samples))
+    return combined_samples
\ No newline at end of file
diff --git a/src/optimize_main.py b/src/optimize_main.py
index 09337d3..cda4981 100644
--- a/src/optimize_main.py
+++ b/src/optimize_main.py
@@ -17,14 +17,15 @@ from pymoo.optimize import minimize
 from scipy.stats import ttest_ind
 from optimization_libraries import initialize_algorithm
 from parallel_computing import execute_parallel_tasks, cleanup_temp_dirs
-from config import PARAMETERS, OBJECTIVES, MAXIMIZE, PARAM_BOUND_VALUES, PARAM_TYPES, PRECISION, PLOT_CONFIG, OPTIMIZATION_CONFIG, N_JOBS
+from config import PARAMETERS, OBJECTIVE_NAMES, MAXIMIZE, PARAM_BOUND_VALUES, PARAM_TYPES, PRECISION, PLOT_CONFIG, OPTIMIZATION_CONFIG, N_JOBS
+from adaptive_instance_selection import initial_sampling, evaluate_samples, advanced_clustering_samples, adaptive_select_informative_instances, iterative_refinement, generate_new_samples
 
 class OptimizationProblem(Problem):
     def __init__(self):
         self.param_names = list(PARAM_BOUND_VALUES.keys())
         self.param_types = [PARAM_TYPES[param] for param in self.param_names]
-        self.objective_names = OBJECTIVES
-        self.maximize_indices = [self.objective_names.index(res) for res in MAXIMIZE]
+        self.objective_names = OBJECTIVE_NAMES
+        self.maximize_indices = [i for i, maximize in enumerate(MAXIMIZE) if maximize]
         n_var = len(self.param_names)
         n_obj = len(self.objective_names)
         xl = np.array([PARAM_BOUND_VALUES[param][0] for param in self.param_names])
@@ -78,21 +79,75 @@ def run_optimization(use_adaptive_instance_selection):
     results_folder = create_results_folder()
 
     # Set the adaptive instance selection flag
-    OPTIMIZATION_CONFIG["USE_ADAPTIVE_INSTANCE_SELECTION"] = use_adaptive_instance_selection
+    OPTIMIZATION_CONFIG['USE_ADAPTIVE_INSTANCE_SELECTION'] = use_adaptive_instance_selection
+    adaptive_frequency = OPTIMIZATION_CONFIG['ADAPTIVE_INSTANCE_SELECTION_FREQUENCY']
+
+    # Initialize the population size
+    pop_size = OPTIMIZATION_CONFIG['POP_SIZE']
 
     # Initialize the optimization algorithm
     algorithm = initialize_algorithm(
             OPTIMIZATION_CONFIG['ALGORITHM_NAME'],
-            OPTIMIZATION_CONFIG.get('POP_SIZE')
+            pop_size
         )
 
     # Define the optimization problem
     problem = OptimizationProblem()
 
     start_time = time.time()
+    res = None  # Initialize res to handle early termination case
     try:
         # Run the optimization
-        res = minimize(problem, algorithm, ("n_gen", OPTIMIZATION_CONFIG['N_GEN']), verbose=True)
+        for gen in range(OPTIMIZATION_CONFIG['N_GEN']):
+            if pop_size <= OPTIMIZATION_CONFIG['MIN_POP_SIZE']:
+                print("Stopping optimization as population size has reached the minimum threshold.")
+                break
+
+            res = minimize(problem, algorithm, ("n_gen", 1), verbose=True)
+
+            if use_adaptive_instance_selection and gen > 0 and (gen + 1) % adaptive_frequency == 0:
+                current_samples = res.pop.get("X")
+                current_results = res.pop.get("F")
+
+                print(f"Generation {gen + 1}: Applying adaptive instance selection")
+                print(f"Current samples: {current_samples.shape}")
+                print(f"Current results: {current_results.shape}")
+
+                try:
+                    # Apply adaptive instance selection
+                    adaptive_samples = adaptive_select_informative_instances(
+                        current_samples, current_results, 
+                        initial_threshold=0.05, adapt_rate=0.01, 
+                        desired_samples=pop_size // 2
+                    )
+
+                    # Select half from algorithm population and half from adaptive instance selection
+                    num_algorithm_samples = pop_size // 2
+                    algorithm_samples_indices = np.random.choice(len(current_samples), num_algorithm_samples, replace=False)
+                    algorithm_samples = current_samples[algorithm_samples_indices]
+
+                    combined_samples = np.vstack((algorithm_samples, adaptive_samples))
+
+                    # Evaluate all combined samples
+                    out = {"F": np.zeros((len(combined_samples), len(problem.objective_names)))}  # Initialize output
+                    problem._evaluate(combined_samples, out=out)  # Evaluate combined samples
+
+                    res.pop.set("X", combined_samples[:pop_size])  # Set only the first pop_size samples
+                    res.pop.set("F", np.array(out["F"])[:pop_size])  # Set only the first pop_size results
+
+                except (ValueError, RuntimeError) as e:  # the selection helpers raise ValueError on empty selections
+                    print(f"Adaptive instance selection failed: {e}")
+                    # If adaptive instance selection fails, fall back to using the current population
+                    res.pop.set("X", current_samples)
+                    res.pop.set("F", current_results)
+
+                # Reduce population size dynamically
+                pop_size = max(OPTIMIZATION_CONFIG['MIN_POP_SIZE'], int(pop_size * 0.9))  # Reduce by 10% each iteration, minimum threshold
+                algorithm = initialize_algorithm(
+                    OPTIMIZATION_CONFIG['ALGORITHM_NAME'],
+                    pop_size
+                )
+
     finally:
         # Cleanup temporary directories
         cleanup_temp_dirs()
@@ -100,26 +155,28 @@ def run_optimization(use_adaptive_instance_selection):
 
     elapsed_time = end_time - start_time
     print(f"Time with{'out' if not use_adaptive_instance_selection else ''} adaptive instance selection: {elapsed_time:.2f} seconds")
-    print_and_plot_results(res, problem)
-
-    # Save results to a file
-    # Negate back the maximized objectives before saving
-    results_to_save = res.F.copy()
-    for i in range(len(results_to_save)):
-        for idx in problem.maximize_indices:
-            results_to_save[i][idx] = -results_to_save[i][idx]
-
-    results_data = {
-        "results": res.F.tolist(),
-        "elapsed_time": elapsed_time,
-        "use_adaptive_instance_selection": use_adaptive_instance_selection
-    }
-    filename = os.path.join(results_folder, f'optimization_results_{"with" if use_adaptive_instance_selection else "without"}_adaptive.json')
-    with open(filename, 'w') as f:
-        json.dump(results_data, f)
-    print(f"Results have been stored in: {filename}")
-
-    return res.F, elapsed_time
+    
+    if res is not None:
+        print_and_plot_results(res, problem)
+
+        # Save results to a file
+        # Negate back the maximized objectives before saving
+        results_to_save = res.F.copy()
+        for i in range(len(results_to_save)):
+            for idx in problem.maximize_indices:
+                results_to_save[i][idx] = -results_to_save[i][idx]
+
+        results_data = {
+            "results": results_to_save.tolist(),
+            "elapsed_time": elapsed_time,
+            "use_adaptive_instance_selection": use_adaptive_instance_selection
+        }
+        filename = os.path.join(results_folder, f'optimization_results_{"with" if use_adaptive_instance_selection else "without"}_adaptive.json')
+        with open(filename, 'w') as f:
+            json.dump(results_data, f)
+        print(f"Results have been stored in: {filename}")
+
+    return (res.F if res is not None else None), elapsed_time
 
 def print_and_plot_results(res, problem):
     print("Optimization Results:")
@@ -136,9 +193,10 @@ def print_and_plot_results(res, problem):
         print()
 
     try: 
-        plt.figure(figsize=(8, 6))
         for idx in problem.maximize_indices:
             res.F[:, idx] = -res.F[:, idx]
+    
+        plt.figure(figsize=(8, 6))
         plt.scatter(res.F[:, 0], res.F[:, 1])
         plt.xlabel(PLOT_CONFIG["PLOT_X"], fontsize=14)
         plt.ylabel(PLOT_CONFIG["PLOT_Y"], fontsize=14)
-- 
GitLab