Add professional training template with MLflow, MinIO, and Inference generation
This commit is contained in:
parent
8817ba5233
commit
a7f2dba8b2
|
|
@ -0,0 +1,96 @@
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
import logging
|
||||||
|
from training.storage_client import MinIOStorageClient
|
||||||
|
import pandas as pd
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
from xgboost import XGBClassifier
|
||||||
|
import mlflow
|
||||||
|
import mlflow.sklearn
|
||||||
|
import joblib
|
||||||
|
|
||||||
|
# Module-wide logging configuration: INFO level to the root handler,
# with a named logger for this training script.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)
|
||||||
|
|
||||||
|
def train(
    dataset_bucket="geocrop-datasets",
    dataset_key="batch_1.csv",
    model_bucket="geocrop-models",
    tracking_uri="http://mlflow.geocrop.svc.cluster.local:5000",
):
    """Run one retraining cycle: fetch data, train, track, and publish.

    All parameters default to the previously hard-coded values, so calling
    ``train()`` with no arguments behaves exactly as before.

    Args:
        dataset_bucket: MinIO bucket holding the training CSV batches.
        dataset_key: Object key of the CSV batch to train on.
        model_bucket: MinIO bucket the serialized model is uploaded to.
        tracking_uri: MLflow tracking server URL (internal K8s DNS default).

    Returns:
        None. Logs an error and returns early if no data is available.
    """
    # 1. Setup environment: point MLflow at the in-cluster tracking server.
    mlflow.set_tracking_uri(tracking_uri)
    mlflow.set_experiment("GeoCrop_Retraining")

    storage = MinIOStorageClient()

    # 2. Fetch data.
    logger.info("Fetching latest training data...")
    df = storage.load_dataset(dataset_bucket, dataset_key)

    # Guard against a missing or empty batch before touching columns.
    if df is None or df.empty:
        logger.error("No data found!")
        return

    # 3. Simple feature engineering (placeholder).
    # FEATURE_ORDER_V1 (subset for example); the target column is assumed
    # to be named 'class' -- TODO confirm against the actual dataset schema.
    features = ['ndvi_peak', 'evi_peak', 'savi_peak']
    target = 'class'

    X = df[features]
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # 4. Training with MLflow tracking.
    with mlflow.start_run():
        logger.info("Training model...")
        model = XGBClassifier(n_estimators=100, max_depth=5)
        model.fit(X_train, y_train)

        # Log metrics (lazy %-style args avoid formatting when disabled).
        score = model.score(X_test, y_test)
        mlflow.log_metric("accuracy", score)
        logger.info("Accuracy: %s", score)

        # Log the model to MLflow's artifact store as well.
        mlflow.sklearn.log_model(model, "model")

    # 5. Save artifact and upload to MinIO. The local pickle is only a
    # staging file, so remove it afterwards -- otherwise repeated runs
    # leave stale model files on the worker's disk.
    model_filename = "Zimbabwe_Ensemble_Model_latest.pkl"
    joblib.dump(model, model_filename)
    try:
        storage.upload_file(model_filename, model_bucket, model_filename)
        logger.info("Uploaded model to MinIO: %s", model_filename)
    finally:
        if os.path.exists(model_filename):
            os.remove(model_filename)

    # 6. Generate an inference script tailored to the trained feature set.
    generate_inference_script(features)
|
||||||
|
|
||||||
|
def generate_inference_script(features, output_path="training/generated_inference.py"):
    """Create a new inference script tailored to the trained model's features.

    Args:
        features: Ordered list of feature column names the model was trained
            on; embedded into the generated script so the inference side
            knows which features to compute.
        output_path: File the script is written to (default preserves the
            original hard-coded location).
    """
    script_content = f'''import os
from training.storage_client import MinIOStorageClient
import joblib
import pandas as pd


def run_inference(aoi_data):
    storage = MinIOStorageClient()
    # Download the latest model
    storage.download_file('geocrop-models', 'Zimbabwe_Ensemble_Model_latest.pkl', 'model.pkl')
    model = joblib.load('model.pkl')

    # Preprocess aoi_data (should pull from DEA STAC here)
    # features = {features}
    # ... logic to fetch from DEA STAC and compute these features ...

    # result = model.predict(preprocessed_data)
    # return result
'''
    # Make sure the destination directory exists, otherwise open() raises
    # FileNotFoundError on a fresh checkout/container.
    directory = os.path.dirname(output_path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    with open(output_path, "w") as f:
        f.write(script_content)
    logger.info("Generated new inference script: %s", output_path)
|
||||||
|
|
||||||
|
# Script entry point: run one full training cycle when executed directly.
if __name__ == "__main__":
    train()
|
||||||
Loading…
Reference in New Issue