Add professional training template with MLflow, MinIO, and Inference generation
This commit is contained in:
parent
8817ba5233
commit
a7f2dba8b2
|
|
@ -0,0 +1,96 @@
|
|||
import sys
|
||||
import os
|
||||
import logging
|
||||
from training.storage_client import MinIOStorageClient
|
||||
import pandas as pd
|
||||
from sklearn.model_selection import train_test_split
|
||||
from xgboost import XGBClassifier
|
||||
import mlflow
|
||||
import mlflow.sklearn
|
||||
import joblib
|
||||
|
||||
# Setup logging
# Configure the root logger once at import time; all module output goes
# through this module-level logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
||||
def train():
    """Run one end-to-end retraining cycle.

    Fetches the latest training batch from MinIO, trains an XGBoost
    classifier, logs the accuracy metric and the model to MLflow, uploads
    the serialized model back to MinIO, and regenerates the inference
    script for the feature set actually trained on.

    Returns early (logging an error) if no data is available or the batch
    is missing required columns.
    """
    # 1. Setup environment.
    # Tracking URI defaults to the in-cluster MLflow service (internal K8s
    # DNS) but can be overridden for local runs via MLFLOW_TRACKING_URI.
    mlflow.set_tracking_uri(
        os.environ.get(
            "MLFLOW_TRACKING_URI",
            "http://mlflow.geocrop.svc.cluster.local:5000",
        )
    )
    mlflow.set_experiment("GeoCrop_Retraining")

    storage = MinIOStorageClient()

    # 2. Fetch data.
    logger.info("Fetching latest training data...")
    # For now, load batch_1.csv as an example batch.
    df = storage.load_dataset('geocrop-datasets', 'batch_1.csv')

    if df is None or df.empty:
        logger.error("No data found!")
        return

    # 3. Simple feature engineering (placeholder).
    # FEATURE_ORDER_V1 (subset for example).
    features = ['ndvi_peak', 'evi_peak', 'savi_peak']
    target = 'class'  # TODO: confirm the actual label column name in the batch CSVs.

    # Fail early with a clear message instead of an opaque KeyError when
    # the batch schema does not match expectations.
    missing = [col for col in features + [target] if col not in df.columns]
    if missing:
        logger.error("Dataset is missing required columns: %s", missing)
        return

    X = df[features]
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # 4. Training with MLflow tracking.
    with mlflow.start_run():
        logger.info("Training model...")
        model = XGBClassifier(n_estimators=100, max_depth=5)
        model.fit(X_train, y_train)

        # Log metrics: mean accuracy on the held-out 20% split.
        score = model.score(X_test, y_test)
        mlflow.log_metric("accuracy", score)
        logger.info("Accuracy: %s", score)

        # Log the model artifact to MLflow (sklearn flavor works because
        # XGBClassifier implements the sklearn estimator API).
        mlflow.sklearn.log_model(model, "model")

    # 5. Save artifact and upload to MinIO.
    model_filename = "Zimbabwe_Ensemble_Model_latest.pkl"
    joblib.dump(model, model_filename)

    storage.upload_file(model_filename, 'geocrop-models', model_filename)
    logger.info("Uploaded model to MinIO: %s", model_filename)

    # 6. Generate an inference script tailored to the trained features.
    generate_inference_script(features)
||||
def generate_inference_script(features):
    """Write an inference script tailored to the trained model's features.

    Args:
        features: List of feature column names the model was trained on;
            embedded in the generated script as a reference comment so the
            inference side knows which features to compute.

    The script is written to ``training/generated_inference.py``.
    """
    log = logging.getLogger(__name__)
    # The generated run_inference body must be indented inside the template
    # so the emitted file is valid Python.
    script_content = f"""
import os
from training.storage_client import MinIOStorageClient
import joblib
import pandas as pd


def run_inference(aoi_data):
    storage = MinIOStorageClient()
    # Download the latest model
    storage.download_file('geocrop-models', 'Zimbabwe_Ensemble_Model_latest.pkl', 'model.pkl')
    model = joblib.load('model.pkl')

    # Preprocess aoi_data (should pull from DEA STAC here)
    # features = {features}
    # ... logic to fetch from DEA STAC and compute these features ...

    # result = model.predict(preprocessed_data)
    # return result
"""
    # Ensure the target directory exists before writing (fresh checkouts
    # or containers may not have created it yet).
    os.makedirs("training", exist_ok=True)
    with open("training/generated_inference.py", "w") as f:
        f.write(script_content)
    log.info("Generated new inference script: training/generated_inference.py")
||||
# Script entry point: run the full retraining pipeline when executed directly.
if __name__ == "__main__":
    train()
Loading…
Reference in New Issue