85 lines
3.5 KiB
Python
85 lines
3.5 KiB
Python
import boto3
|
|
import os
|
|
import logging
|
|
import pandas as pd
|
|
from botocore.exceptions import ClientError
|
|
from io import BytesIO
|
|
|
|
# Configure module-wide logging: INFO level with timestamped, leveled messages.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger named after this module, per stdlib logging convention.
logger = logging.getLogger(__name__)
|
|
|
|
class MinIOStorageClient:
|
|
"""
|
|
A professional client for interacting with GeoCrop MinIO storage.
|
|
Optimized for JupyterLab training workflows.
|
|
"""
|
|
def __init__(self):
|
|
# Configuration from Kubernetes Environment Variables
|
|
self.endpoint_url = os.environ.get('AWS_S3_ENDPOINT_URL', 'http://minio.geocrop.svc.cluster.local:9000')
|
|
self.access_key = os.environ.get('AWS_ACCESS_KEY_ID')
|
|
self.secret_key = os.environ.get('AWS_SECRET_ACCESS_KEY')
|
|
|
|
self.s3_client = boto3.client(
|
|
's3',
|
|
endpoint_url=self.endpoint_url,
|
|
aws_access_key_id=self.access_key,
|
|
aws_secret_access_key=self.secret_key,
|
|
config=boto3.session.Config(signature_version='s3v4'),
|
|
verify=False # Required for internal self-signed K3s certs
|
|
)
|
|
|
|
def list_files(self, bucket_name: str, prefix: str = "") -> list:
|
|
"""Lists metadata for all files in a bucket/folder."""
|
|
try:
|
|
response = self.s3_client.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
|
|
files = []
|
|
if 'Contents' in response:
|
|
for obj in response['Contents']:
|
|
files.append({
|
|
"key": obj['Key'],
|
|
"size_mb": round(obj['Size'] / (1024 * 1024), 2),
|
|
"last_modified": obj['LastModified']
|
|
})
|
|
return files
|
|
except ClientError as e:
|
|
logger.error(f"Failed to list {bucket_name}: {e}")
|
|
return []
|
|
|
|
def load_dataset(self, bucket_name: str, object_name: str) -> pd.DataFrame:
|
|
"""
|
|
Reads a CSV directly from MinIO into a Pandas DataFrame without saving to disk.
|
|
Best for training on large batches.
|
|
"""
|
|
try:
|
|
logger.info(f"Streaming s3://{bucket_name}/{object_name} into memory...")
|
|
obj = self.s3_client.get_object(Bucket=bucket_name, Key=object_name)
|
|
return pd.read_csv(BytesIO(obj['Body'].read()))
|
|
except Exception as e:
|
|
logger.error(f"Failed to load dataset {object_name}: {e}")
|
|
return pd.DataFrame()
|
|
|
|
def download_file(self, bucket_name: str, object_name: str, local_path: str):
|
|
"""Downloads a binary file (e.g., a TIFF baseline) to local storage."""
|
|
try:
|
|
self.s3_client.download_file(bucket_name, object_name, local_path)
|
|
logger.info(f"Successfully downloaded to {local_path}")
|
|
except ClientError as e:
|
|
logger.error(f"Download failed: {e}")
|
|
|
|
def upload_artifact(self, local_path: str, bucket_name: str, object_name: str):
|
|
"""Uploads a local file (e.g., a trained model .pkl) back to MinIO."""
|
|
try:
|
|
self.s3_client.upload_file(local_path, bucket_name, object_name)
|
|
logger.info(f"Artifact uploaded to s3://{bucket_name}/{object_name}")
|
|
except ClientError as e:
|
|
logger.error(f"Upload failed: {e}")
|
|
|
|
def get_pandas_options(self):
|
|
"""Returns a dict to use with pd.read_csv('s3://...') if preferred."""
|
|
return {
|
|
"key": self.access_key,
|
|
"secret": self.secret_key,
|
|
"client_kwargs": {"endpoint_url": self.endpoint_url}
|
|
}
|