22Example usage of Deep Feature Barcoding with Combinatorial Genetic Algorithm
33
44This script demonstrates how to use the various barcoding methods
5- with synthetic data for testing and development purposes .
5+ with real Fashion-MNIST dataset for testing and evaluation .
66"""
77
88import numpy as np
1212# Add src to path to import modules
1313sys .path .append (os .path .join (os .path .dirname (__file__ ), 'src' ))
1414
15- from src .utils import evaluate_retrieval , setup_seed
15+ from src .utils import evaluate_retrieval , setup_seed , load_dataset
1616from src .methods import CGA , AHash , DHash , MinMax , DFT , ITQ , LSH
1717
1818
def generate_synthetic_data(n_samples=1000, n_features=100, n_classes=5, seed=42):
    """
    Build a labeled synthetic dataset with Gaussian class clusters.

    Each class gets its own randomly drawn center, so the samples carry
    enough structure for retrieval-style evaluation of barcoding methods.

    Args:
        n_samples: Total number of samples across all classes.
        n_features: Dimensionality of every feature vector.
        n_classes: How many distinct class labels to produce.
        seed: Seed applied to NumPy's global RNG for reproducibility.

    Returns:
        Tuple of (features, labels): an (n_samples, n_features) float array
        and an aligned 1-D integer label array, jointly shuffled.
    """
    np.random.seed(seed)

    base, extra = divmod(n_samples, n_classes)
    feature_chunks = []
    label_chunks = []

    for class_id in range(n_classes):
        # Class-specific center; the factor of 2 spreads clusters apart.
        center = np.random.randn(n_features) * 2

        # The first `extra` classes absorb the remainder of the division,
        # so the per-class counts always sum to exactly n_samples.
        count = base + (1 if class_id < extra else 0)

        feature_chunks.append(np.random.randn(count, n_features) + center)
        label_chunks.append(np.full(count, class_id))

    features = np.vstack(feature_chunks)
    labels = np.hstack(label_chunks)

    # One joint permutation keeps features and labels aligned.
    order = np.random.permutation(len(features))
    return features[order], labels[order]
62-
63-
def split_data(features, labels, train_ratio=0.6, val_ratio=0.2, test_ratio=0.2):
    """
    Split data into train, validation, and test sets.

    The split is positional (no shuffling), so shuffle beforehand if the
    data is ordered. The test set receives every sample left after the
    train and validation slices, so no samples are ever dropped.

    Args:
        features: Array-like of shape (n_samples, n_features).
        labels: Array-like of shape (n_samples,), aligned with features.
        train_ratio: Fraction of samples for training.
        val_ratio: Fraction of samples for validation.
        test_ratio: Fraction of samples for testing; together with the
            other two ratios it must sum to 1.

    Returns:
        Tuple of (train_features, train_labels, val_features, val_labels,
        test_features, test_labels).

    Raises:
        ValueError: If the three ratios do not sum to 1 (within a small
            floating-point tolerance).
    """
    # Previously `test_ratio` was accepted but silently ignored; validate
    # it so inconsistent ratios fail loudly instead of quietly skewing the
    # size of the test split.
    total = train_ratio + val_ratio + test_ratio
    if abs(total - 1.0) > 1e-6:
        raise ValueError(
            f"train_ratio + val_ratio + test_ratio must sum to 1, got {total}"
        )

    n_samples = len(features)
    n_train = int(n_samples * train_ratio)
    n_val = int(n_samples * val_ratio)

    train_features = features[:n_train]
    train_labels = labels[:n_train]

    val_features = features[n_train:n_train + n_val]
    val_labels = labels[n_train:n_train + n_val]

    # Remainder (including truncation from int()) goes to the test split.
    test_features = features[n_train + n_val:]
    test_labels = labels[n_train + n_val:]

    return (train_features, train_labels,
            val_features, val_labels,
            test_features, test_labels)
82-
83-
8419def run_barcoding_example (method_name , barcoder , train_features , train_labels ,
8520 test_features , test_labels , k = 5 ):
8621 """
@@ -133,29 +68,27 @@ def main():
13368 # Set random seed for reproducibility
13469 setup_seed (42 )
13570
136- # Generate synthetic data
137- print ("\n Generating synthetic dataset..." )
138- features , labels = generate_synthetic_data (
139- n_samples = 1000 ,
140- n_features = 64 ,
141- n_classes = 5 ,
142- seed = 42
143- )
144-
145- print (f"Generated dataset: { features .shape [0 ]} samples, { features .shape [1 ]} features, { len (np .unique (labels ))} classes" )
146-
147- # Split data
148- print ("Splitting data into train/val/test sets..." )
149- train_features , train_labels , val_features , val_labels , test_features , test_labels = split_data (
150- features , labels
151- )
152-
153- print (f"Train: { len (train_features )} samples" )
154- print (f"Validation: { len (val_features )} samples" )
155- print (f"Test: { len (test_features )} samples" )
71+ # Load Fashion-MNIST dataset
72+ print ("\n Loading Fashion-MNIST dataset..." )
73+ try :
74+ train_features , train_labels , val_features , val_labels , test_features , test_labels = load_dataset (
75+ dataset_name = "fashion" ,
76+ download = True # Auto-download if not available
77+ )
78+
79+ print (f"Dataset loaded successfully!" )
80+ print (f"Train: { len (train_features )} samples, { train_features .shape [1 ]} features" )
81+ print (f"Validation: { len (val_features )} samples" )
82+ print (f"Test: { len (test_features )} samples" )
83+ print (f"Number of classes: { len (np .unique (train_labels ))} " )
84+
85+ except Exception as e :
86+ print (f"Error loading Fashion-MNIST dataset: { str (e )} " )
87+ print ("Please check if the dataset is available or try downloading it manually." )
88+ return
15689
15790 # Initialize barcoding methods
158- num_features = features .shape [1 ]
91+ num_features = train_features .shape [1 ]
15992 num_bits = 32 # Using smaller number of bits for faster computation in example
16093 k = 5 # Number of neighbors for evaluation
16194
@@ -211,7 +144,7 @@ def main():
211144
212145 # Print summary results
213146 print ("\n " + "=" * 50 )
214- print ("SUMMARY RESULTS" )
147+ print ("SUMMARY RESULTS - Fashion-MNIST Dataset " )
215148 print ("=" * 50 )
216149 print (f"{ 'Method' :<15} { 'F1' :<8} { 'Prec@{k}' :<8} { 'mAP' :<8} " .format (k = k ))
217150 print ("-" * 50 )
@@ -223,8 +156,9 @@ def main():
223156 print (f"{ method_name :<15} { 'ERROR' :<8} { 'ERROR' :<8} { 'ERROR' :<8} " )
224157
225158 print ("\n Example completed successfully!" )
226- print ("\n Note: This example uses synthetic data and reduced parameters for speed." )
227- print ("For real experiments, use larger populations, more generations, and real datasets." )
159+ print ("\n Note: This example uses Fashion-MNIST dataset with reduced parameters for speed." )
160+ print ("For real experiments, use larger populations, more generations, and other datasets." )
161+ print ("\n Available datasets: fashion, cifar10, cifar100, covid19, and various TCGA medical datasets." )
228162
229163
230164if __name__ == "__main__" :
0 commit comments