"""Retraining entry point for the GeoCrop crop-classification model.

Loads the latest training batch from MinIO, trains an XGBoost classifier
with MLflow experiment tracking, uploads the serialized model back to
MinIO, and regenerates the feature-specific inference script.
"""
import sys
import os
import logging
from training.storage_client import MinIOStorageClient
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import mlflow
import mlflow.sklearn
import joblib

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def train():
    """Run one end-to-end retraining cycle.

    Fetches the training batch from MinIO, trains and evaluates an
    XGBoost classifier under an MLflow run, uploads the pickled model
    to the ``geocrop-models`` bucket, and regenerates the inference
    script. Returns early (with an error log) if the dataset is missing
    or lacks the required columns.
    """
    # 1. Setup Environment
    # MLflow server URL (Internal K8s DNS)
    mlflow.set_tracking_uri("http://mlflow.geocrop.svc.cluster.local:5000")
    mlflow.set_experiment("GeoCrop_Retraining")

    storage = MinIOStorageClient()

    # 2. Fetch Data
    logger.info("Fetching latest training data...")
    # For now, let's load batch_1.csv as an example
    df = storage.load_dataset('geocrop-datasets', 'batch_1.csv')

    if df is None or df.empty:
        logger.error("No data found!")
        return

    # 3. Simple Feature Engineering (Placeholder)
    # Filter columns to only include features and target
    # FEATURE_ORDER_V1 (subset for example)
    features = ['ndvi_peak', 'evi_peak', 'savi_peak']
    target = 'class'  # Update with actual column name

    # Fail fast with a clear message if the batch schema has drifted,
    # instead of an opaque KeyError from pandas indexing below.
    missing = [col for col in features + [target] if col not in df.columns]
    if missing:
        logger.error("Dataset is missing required columns: %s", missing)
        return

    X = df[features]
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # 4. Training with MLflow Tracking
    with mlflow.start_run():
        logger.info("Training model...")
        # random_state pins the booster's internal sampling so reruns on
        # the same batch produce the same model (the split is already seeded).
        params = {"n_estimators": 100, "max_depth": 5, "random_state": 42}
        mlflow.log_params(params)
        model = XGBClassifier(**params)
        model.fit(X_train, y_train)

        # Log Metrics (mean accuracy on the held-out split)
        score = model.score(X_test, y_test)
        mlflow.log_metric("accuracy", score)
        logger.info("Accuracy: %s", score)

        # Log Model to MLflow
        mlflow.sklearn.log_model(model, "model")

        # 5. Save Artifact and Upload to MinIO
        model_filename = "Zimbabwe_Ensemble_Model_latest.pkl"
        joblib.dump(model, model_filename)

        storage.upload_file(model_filename, 'geocrop-models', model_filename)
        logger.info("Uploaded model to MinIO: %s", model_filename)

        # 6. Generate Inference Script (Dynamic)
        generate_inference_script(features)


def generate_inference_script(features):
    """
    Creates a new inference script tailored to the trained model's features.

    :param features: list of feature column names the model was trained on;
        interpolated into the generated script as a reference comment.
    """
    script_content = f"""
import os
from training.storage_client import MinIOStorageClient
import joblib
import pandas as pd

def run_inference(aoi_data):
    storage = MinIOStorageClient()
    # Download the latest model
    storage.download_file('geocrop-models', 'Zimbabwe_Ensemble_Model_latest.pkl', 'model.pkl')
    model = joblib.load('model.pkl')

    # Preprocess aoi_data (should pull from DEA STAC here)
    # features = {features}
    # ... logic to fetch from DEA STAC and compute these features ...

    # result = model.predict(preprocessed_data)
    # return result
"""
    # Ensure the target directory exists — the job may run from a working
    # directory (e.g. a container root) that has no training/ folder yet.
    os.makedirs("training", exist_ok=True)
    with open("training/generated_inference.py", "w") as f:
        f.write(script_content)
    logger.info("Generated new inference script: training/generated_inference.py")


if __name__ == "__main__":
    train()