Update storage client with load_dataset and add comprehensive README

This commit is contained in:
fchinembiri 2026-04-23 22:29:19 +02:00
parent 182bf12e2d
commit bdc4d52f21
2 changed files with 136 additions and 49 deletions

68
README.md Normal file
View File

@ -0,0 +1,68 @@
# Sovereign MLOps Platform: GeoCrop LULC Portfolio
Welcome to the **Sovereign MLOps Platform**, a comprehensive self-hosted environment on K3s designed for end-to-end Land Use / Land Cover (LULC) crop-mapping in Zimbabwe.
This project showcases professional skills in **MLOps, Cloud-Native Architecture, Geospatial Analysis, and GitOps**.
## 🏗️ System Architecture
The platform is built on a robust, self-hosted Kubernetes (K3s) cluster with a focus on data sovereignty and scalability.
- **Source Control & CI/CD**: [Gitea](https://git.techarvest.co.zw) (Self-hosted GitHub alternative)
- **Infrastructure as Code**: Terraform (Managing K3s Namespaces & Quotas)
- **GitOps**: ArgoCD (Automated deployment from Git to Cluster)
- **Experiment Tracking**: [MLflow](https://ml.techarvest.co.zw) (Model versioning & metrics)
- **Interactive Workspace**: [JupyterLab](https://lab.techarvest.co.zw) (Data science & training)
- **Spatial Database**: Standalone PostgreSQL + PostGIS (Port 5433)
- **Object Storage**: MinIO (S3-compatible storage for datasets, baselines, and models)
- **Frontend**: React 19 + OpenLayers (Parallel loading of baselines and ML predictions)
- **Backend**: FastAPI + Redis Queue (Job orchestration)
- **Visualization**: TiTiler (Dynamic tile server for Cloud Optimized GeoTIFFs)
## 🗺️ UX Data Flow: Parallel Loading Strategy
To ensure a seamless user experience, the system implements a dual-loading strategy:
1. **Instant Context**: While waiting for ML inference, Dynamic World (DW) TIFF baselines (2015-2025) are immediately served from MinIO via TiTiler.
2. **Asynchronous Inference**: The ML worker processes heavy classification tasks in the background and overlays high-resolution predictions once complete.
## 🛠️ Training Workflow
Training is performed in **JupyterLab** using a custom `MinIOStorageClient` that bridges the gap between object storage and in-memory data processing.
### Using the MinIO Storage Client
```python
from training.storage_client import MinIOStorageClient
# Initialize client (uses environment variables automatically)
storage = MinIOStorageClient()
# List available training batches
batches = storage.list_files('geocrop-datasets')
# Load a batch directly into memory (No disk I/O)
df = storage.load_dataset('geocrop-datasets', 'batch_1.csv')
# Train your model and upload the artifact
# ... training code ...
storage.upload_artifact('model.pkl', 'geocrop-models', 'Zimbabwe_Ensemble_Model.pkl')
```
## 🚀 Deployment & GitOps
The platform follows a strict **GitOps** workflow:
1. All changes are committed to the `geocrop-platform` repository on Gitea.
2. Gitea Actions build and push containers to Docker Hub (`frankchine`).
3. ArgoCD monitors the `k8s/base` directory and automatically synchronizes the cluster state.
## 🖥️ Service Registry
- **Portfolio Frontend**: [portfolio.techarvest.co.zw](https://portfolio.techarvest.co.zw)
- **Source Control**: [git.techarvest.co.zw](https://git.techarvest.co.zw)
- **JupyterLab**: [lab.techarvest.co.zw](https://lab.techarvest.co.zw)
- **MLflow**: [ml.techarvest.co.zw](https://ml.techarvest.co.zw)
- **ArgoCD**: [cd.techarvest.co.zw](https://cd.techarvest.co.zw)
- **MinIO Console**: [console.minio.portfolio.techarvest.co.zw](https://console.minio.portfolio.techarvest.co.zw)
---
*Created and maintained by [fchinembiri](mailto:fchinembiri24@gmail.com).*

View File

@ -2,83 +2,102 @@ import boto3
import os
import logging
import pandas as pd
import io
from botocore.exceptions import ClientError
from io import BytesIO
# Configure logging for the worker/training scripts: INFO level with
# timestamped, level-tagged messages. All methods below log through `logger`.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
class MinIOStorageClient:
    """
    A reusable client for interacting with the local MinIO storage.

    Handles datasets (CSVs), baselines (DW TIFFs), and model artifacts.
    Optimized for JupyterLab training workflows.
    """

    def __init__(self):
        """Initialize the S3 client from environment variables.

        Falls back to the internal Kubernetes DNS name for MinIO when
        AWS_S3_ENDPOINT_URL is not set.
        """
        self.endpoint_url = os.environ.get('AWS_S3_ENDPOINT_URL', 'http://minio.geocrop.svc.cluster.local:9000')
        # Kept as attributes so helpers (e.g. get_pandas_options) can reuse them.
        self.access_key = os.environ.get('AWS_ACCESS_KEY_ID')
        self.secret_key = os.environ.get('AWS_SECRET_ACCESS_KEY')
        self.s3_client = boto3.client(
            's3',
            endpoint_url=self.endpoint_url,
            aws_access_key_id=self.access_key,
            aws_secret_access_key=self.secret_key,
            config=boto3.session.Config(signature_version='s3v4'),
            verify=False  # MinIO often uses self-signed certs internally
        )
def list_files(self, bucket_name: str, prefix: str = "") -> list:
    """List objects in a bucket, optionally filtered by a prefix "folder".

    Returns a list of dicts with 'key' and 'size_mb'. Returns an empty
    list when the bucket has no matches or the S3 call fails (errors are
    logged, never raised).
    """
    try:
        logger.info(f"Scanning bucket '{bucket_name}' with prefix '{prefix}'...")
        response = self.s3_client.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
        files = []
        # 'Contents' is absent (not empty) when nothing matches the prefix.
        if 'Contents' in response:
            for obj in response['Contents']:
                size_mb = obj['Size'] / 1024 / 1024
                files.append({"key": obj['Key'], "size_mb": size_mb})
                logger.info(f" - Found: {obj['Key']} ({size_mb:.2f} MB)")
        else:
            logger.warning(f"No files found in bucket {bucket_name}.")
        # NOTE(review): list_objects_v2 returns at most 1000 keys per call;
        # there is no pagination here — confirm batch counts stay below that.
        return files
    except ClientError as e:
        logger.error(f"Failed to list files in {bucket_name}: {e}")
        return []
def download_file(self, bucket_name: str, object_name: str, download_path: str) -> bool:
    """Fetch an object from MinIO and write it to local pod/container storage.

    Returns True on success, False when the S3 transfer fails (the error
    is logged rather than raised).
    """
    logger.info(f"Downloading {object_name} from {bucket_name} to {download_path}...")
    try:
        self.s3_client.download_file(bucket_name, object_name, download_path)
    except ClientError as e:
        logger.error(f"Error downloading {object_name}: {e}")
        return False
    logger.info("Download complete.")
    return True
def upload_file(self, file_path: str, bucket_name: str, object_name: str) -> bool:
    """Push a local file (trained model, prediction COG, ...) into MinIO.

    Returns True when the upload succeeds, False otherwise (the error is
    logged rather than raised).
    """
    logger.info(f"Uploading {file_path} to {bucket_name}/{object_name}...")
    try:
        self.s3_client.upload_file(file_path, bucket_name, object_name)
    except ClientError as e:
        logger.error(f"Error uploading {file_path}: {e}")
        return False
    logger.info("Upload complete.")
    return True
def load_dataset(self, bucket_name: str, object_name: str) -> pd.DataFrame:
    """Load a CSV object from MinIO straight into a DataFrame (no disk I/O).

    Returns the parsed DataFrame, or None when either the S3 fetch fails
    (ClientError) or the payload cannot be parsed as CSV; both failure
    modes are logged instead of raised.
    """
    try:
        logger.info(f"Loading {object_name} from {bucket_name} into memory...")
        response = self.s3_client.get_object(Bucket=bucket_name, Key=object_name)
        # Read the streaming body fully, then hand pandas an in-memory buffer.
        df = pd.read_csv(io.BytesIO(response['Body'].read()))
        logger.info(f"Successfully loaded dataset with shape: {df.shape}")
        return df
    except ClientError as e:
        logger.error(f"Error loading {object_name}: {e}")
        return None
    except Exception as e:
        logger.error(f"Error parsing {object_name} into DataFrame: {e}")
        return None
def download_file(self, bucket_name: str, object_name: str, local_path: str):
    """Copy one object (e.g., a TIFF baseline) out of MinIO onto local disk.

    Failures are logged, not raised; nothing is returned.
    """
    try:
        self.s3_client.download_file(bucket_name, object_name, local_path)
    except ClientError as e:
        logger.error(f"Download failed: {e}")
    else:
        logger.info(f"Successfully downloaded to {local_path}")
def upload_artifact(self, local_path: str, bucket_name: str, object_name: str):
    """Uploads a local file (e.g., a trained model .pkl) back to MinIO.

    Failures are logged, not raised; nothing is returned.
    """
    try:
        self.s3_client.upload_file(local_path, bucket_name, object_name)
        logger.info(f"Artifact uploaded to s3://{bucket_name}/{object_name}")
    except ClientError as e:
        logger.error(f"Upload failed: {e}")

def get_pandas_options(self):
    """Returns a dict to use with pd.read_csv('s3://...') if preferred.

    Mirrors the credentials/endpoint captured in __init__ in the shape
    expected by s3fs-style storage_options.
    """
    return {
        "key": self.access_key,
        "secret": self.secret_key,
        "client_kwargs": {"endpoint_url": self.endpoint_url}
    }

# ==========================================
# Example Usage (For your Jupyter Notebooks)
# ==========================================
if __name__ == "__main__":
    storage = MinIOStorageClient()

    # 1. List the Zimbabwe Augmented CSV batches
    datasets_bucket = 'geocrop-datasets'
    csv_files = storage.list_files(datasets_bucket)

    # 2. Load a batch directly into memory for training
    if csv_files:
        first_batch_key = csv_files[0]['key']
        df = storage.load_dataset(datasets_bucket, first_batch_key)
        if df is not None:
            print(df.info())