mirror of
https://github.com/ghndrx/file-transformer-s3.git
synced 2026-02-10 06:45:05 +00:00
Initial commit: File Transformer S3 project with React dashboard and Knative functions
This commit is contained in:
29
functions/Makefile
Normal file
29
functions/Makefile
Normal file
@@ -0,0 +1,29 @@
|
||||
# Build orchestration for the four Knative function images.
# Each target shells into the function's directory and builds a
# locally-tagged Docker image (:latest).
.PHONY: build build-upload build-transform build-download build-metadata clean

# Build all functions
build: build-upload build-transform build-download build-metadata

# Build upload function
build-upload:
	@echo "Building upload function..."
	@cd upload && docker build -t function-upload:latest .

# Build transform function
build-transform:
	@echo "Building transform function..."
	@cd transform && docker build -t function-transform:latest .

# Build download function
build-download:
	@echo "Building download function..."
	@cd download && docker build -t function-download:latest .

# Build metadata function
build-metadata:
	@echo "Building metadata function..."
	@cd metadata && docker build -t function-metadata:latest .

# Clean all function images
# (stderr suppressed and `|| true` so clean succeeds even when images
# were never built)
clean:
	@echo "Cleaning function images..."
	@docker rmi function-upload:latest function-transform:latest function-download:latest function-metadata:latest 2>/dev/null || true
|
||||
30
functions/download/Dockerfile
Normal file
30
functions/download/Dockerfile
Normal file
@@ -0,0 +1,30 @@
|
||||
FROM python:3.11-slim

# Set working directory
WORKDIR /app

# Install system dependencies.
# FIX: curl is required by the HEALTHCHECK below, but python:*-slim does
# not ship it — without this the healthcheck fails on every probe.
RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements and install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY app.py .

# Create non-root user
RUN useradd --create-home --shell /bin/bash app && chown -R app:app /app
USER app

# Expose port
EXPOSE 5000

# Health check (hits the Flask /health endpoint)
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:5000/health || exit 1

# Run the application
CMD ["gunicorn", "--bind", "0.0.0.0:5000", "--workers", "4", "--timeout", "300", "app:app"]
|
||||
201
functions/download/app.py
Normal file
201
functions/download/app.py
Normal file
@@ -0,0 +1,201 @@
|
||||
import os
import logging
from datetime import datetime
from typing import Optional

from flask import Flask, request, jsonify, send_file
import psycopg2
from psycopg2.extras import RealDictCursor
from minio import Minio
from minio.error import S3Error
import structlog
import io

# Configure structured logging: emit JSON lines with ISO timestamps so
# log aggregators can parse service output without extra processing.
structlog.configure(
    processors=[
        structlog.stdlib.filter_by_level,
        structlog.stdlib.add_logger_name,
        structlog.stdlib.add_log_level,
        structlog.stdlib.PositionalArgumentsFormatter(),
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.StackInfoRenderer(),
        structlog.processors.format_exc_info,
        structlog.processors.UnicodeDecoder(),
        structlog.processors.JSONRenderer()
    ],
    context_class=dict,
    logger_factory=structlog.stdlib.LoggerFactory(),
    wrapper_class=structlog.stdlib.BoundLogger,
    cache_logger_on_first_use=True,
)

logger = structlog.get_logger()

app = Flask(__name__)

# Configuration — overridable via environment variables.
# NOTE(review): the credential defaults below are development-only;
# production deployments must supply real secrets via the environment.
MINIO_ENDPOINT = os.getenv('MINIO_ENDPOINT', 'localhost:9000')
MINIO_ACCESS_KEY = os.getenv('MINIO_ACCESS_KEY', 'minioadmin')
MINIO_SECRET_KEY = os.getenv('MINIO_SECRET_KEY', 'minioadmin123')
MINIO_BUCKET_NAME = os.getenv('MINIO_BUCKET_NAME', 'file-transformer-bucket')
MINIO_USE_SSL = os.getenv('MINIO_USE_SSL', 'false').lower() == 'true'

POSTGRES_URL = os.getenv('POSTGRES_URL', 'postgresql://file_user:secure_password_123@localhost:5432/file_transformer')

# Initialize MinIO client (shared, module-level; the Minio client is
# stateless apart from connection pooling)
minio_client = Minio(
    MINIO_ENDPOINT,
    access_key=MINIO_ACCESS_KEY,
    secret_key=MINIO_SECRET_KEY,
    secure=MINIO_USE_SSL
)
|
||||
|
||||
def get_db_connection():
    """Open and return a fresh PostgreSQL connection from POSTGRES_URL."""
    connection = psycopg2.connect(POSTGRES_URL)
    return connection
|
||||
|
||||
def get_file_info(file_id: str) -> Optional[dict]:
    """Fetch a non-deleted file row as a plain dict.

    Returns None both when no matching row exists and when the lookup
    fails for any reason (the error is logged, not raised).
    """
    connection = get_db_connection()
    try:
        with connection.cursor(cursor_factory=RealDictCursor) as cursor:
            cursor.execute("""
                SELECT id, filename, original_filename, file_size, file_type,
                       mime_type, bucket_name, object_key, status
                FROM files
                WHERE id = %s AND status != 'deleted'
            """, (file_id,))
            row = cursor.fetchone()
        if row is None:
            return None
        return dict(row)
    except Exception as exc:
        logger.error("Failed to get file info", error=str(exc))
        return None
    finally:
        connection.close()
|
||||
|
||||
def log_file_access(file_id: str, action: str, ip_address: str, user_agent: Optional[str]):
    """Record an audit-trail entry for a file access.

    Best effort: on failure the transaction is rolled back and the error
    logged; the caller is never interrupted.
    """
    connection = get_db_connection()
    try:
        with connection.cursor() as cursor:
            cursor.execute("""
                INSERT INTO file_access_logs (file_id, action, ip_address, user_agent)
                VALUES (%s, %s, %s, %s)
            """, (file_id, action, ip_address, user_agent))
        connection.commit()
    except Exception as exc:
        logger.error("Failed to log file access", error=str(exc))
        connection.rollback()
    finally:
        connection.close()
|
||||
|
||||
@app.route('/health', methods=['GET'])
def health_check():
    """Liveness probe: report that the download service is up."""
    payload = {'status': 'healthy', 'service': 'file-download'}
    return jsonify(payload)
|
||||
|
||||
@app.route('/download/<file_id>', methods=['GET'])
def download_file(file_id: str):
    """Serve a stored file back to the client as an attachment.

    Looks up the file record, fetches the object from MinIO, records an
    audit entry, and streams the bytes with the original filename and
    MIME type. Returns 404 when the record or object is missing, 500 on
    unexpected errors.
    """
    try:
        # get_file_info() already filters out rows with status 'deleted',
        # so a missing record covers both "never existed" and "deleted"
        # (the old explicit deleted-status branch was unreachable).
        file_info = get_file_info(file_id)
        if not file_info:
            return jsonify({'error': 'File not found'}), 404

        # Fetch the object. The payload is buffered fully in memory;
        # fine for modest files, revisit for very large objects.
        try:
            response = minio_client.get_object(
                file_info['bucket_name'],
                file_info['object_key']
            )
            try:
                file_data = response.read()
            finally:
                # FIX: release the pooled connection even if read() fails.
                response.close()
                response.release_conn()
        except S3Error as e:
            logger.error("Failed to get file from MinIO", error=str(e))
            return jsonify({'error': 'File not found in storage'}), 404

        # Audit trail (best effort; failures are logged inside the helper).
        log_file_access(file_id, 'download', request.remote_addr,
                        request.headers.get('User-Agent'))

        logger.info("File download completed",
                    file_id=file_id,
                    filename=file_info['filename'],
                    size=len(file_data))

        # A fresh BytesIO is already positioned at 0; no seek needed.
        return send_file(
            io.BytesIO(file_data),
            mimetype=file_info['mime_type'],
            as_attachment=True,
            download_name=file_info['original_filename']
        )

    except Exception as e:
        logger.error("Download error", error=str(e))
        return jsonify({'error': 'Internal server error'}), 500
|
||||
|
||||
@app.route('/files/<file_id>/info', methods=['GET'])
def get_file_info_endpoint(file_id: str):
    """Return file metadata as JSON without serving the file bytes."""
    try:
        record = get_file_info(file_id)
        if record is None:
            return jsonify({'error': 'File not found'}), 404

        # Record the metadata view in the audit trail.
        log_file_access(file_id, 'view', request.remote_addr,
                        request.headers.get('User-Agent'))

        return jsonify(record), 200

    except Exception as exc:
        logger.error("Error fetching file info", error=str(exc))
        return jsonify({'error': 'Internal server error'}), 500
|
||||
|
||||
@app.route('/files/<file_id>/url', methods=['GET'])
def get_download_url(file_id: str):
    """Return a presigned MinIO download URL valid for one hour.

    404 when the file record is missing, 500 when URL generation fails
    or on unexpected errors.
    """
    # Local import: the module only imports `datetime` from datetime.
    from datetime import timedelta

    try:
        # get_file_info() already excludes deleted rows, so the old
        # explicit deleted-status branch was unreachable and is dropped.
        file_info = get_file_info(file_id)
        if not file_info:
            return jsonify({'error': 'File not found'}), 404

        # Generate presigned URL.
        try:
            url = minio_client.presigned_get_object(
                file_info['bucket_name'],
                file_info['object_key'],
                # FIX: minio 7.x requires a timedelta here; the previous
                # bare int (3600) raises a TypeError/ValueError.
                expires=timedelta(seconds=3600)  # 1 hour
            )
        except S3Error as e:
            logger.error("Failed to generate presigned URL", error=str(e))
            return jsonify({'error': 'Failed to generate download URL'}), 500

        # Audit trail (best effort).
        log_file_access(file_id, 'url_generated', request.remote_addr,
                        request.headers.get('User-Agent'))

        return jsonify({
            'file_id': file_id,
            'filename': file_info['original_filename'],
            'download_url': url,
            'expires_in': 3600
        }), 200

    except Exception as e:
        logger.error("Error generating download URL", error=str(e))
        return jsonify({'error': 'Internal server error'}), 500
|
||||
|
||||
if __name__ == '__main__':
    # Development entry point only; production runs under gunicorn
    # (see the Dockerfile CMD).
    app.run(host='0.0.0.0', port=5000, debug=False)
|
||||
24
functions/download/requirements.txt
Normal file
24
functions/download/requirements.txt
Normal file
@@ -0,0 +1,24 @@
|
||||
# Core dependencies
|
||||
flask==2.3.3
|
||||
gunicorn==21.2.0
|
||||
python-dotenv==1.0.0
|
||||
|
||||
# Database
|
||||
psycopg2-binary==2.9.7
|
||||
sqlalchemy==2.0.21
|
||||
|
||||
# MinIO/S3
|
||||
minio==7.1.17
|
||||
boto3==1.28.44
|
||||
|
||||
# HTTP requests
|
||||
requests==2.31.0
|
||||
|
||||
# JSON and data handling
|
||||
pydantic==2.1.1
|
||||
|
||||
# Logging
|
||||
structlog==23.1.0
|
||||
|
||||
# Utilities
|
||||
python-dateutil==2.8.2
|
||||
30
functions/metadata/Dockerfile
Normal file
30
functions/metadata/Dockerfile
Normal file
@@ -0,0 +1,30 @@
|
||||
FROM python:3.11-slim

# Set working directory
WORKDIR /app

# Install system dependencies.
# FIX: curl is required by the HEALTHCHECK below, but python:*-slim does
# not ship it — without this the healthcheck fails on every probe.
RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements and install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY app.py .

# Create non-root user
RUN useradd --create-home --shell /bin/bash app && chown -R app:app /app
USER app

# Expose port
EXPOSE 5000

# Health check (hits the Flask /health endpoint)
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:5000/health || exit 1

# Run the application
CMD ["gunicorn", "--bind", "0.0.0.0:5000", "--workers", "4", "--timeout", "300", "app:app"]
|
||||
307
functions/metadata/app.py
Normal file
307
functions/metadata/app.py
Normal file
@@ -0,0 +1,307 @@
|
||||
import os
import json
import logging
from datetime import datetime
from typing import Optional, Dict, Any

from flask import Flask, request, jsonify
import psycopg2
from psycopg2.extras import RealDictCursor
from minio import Minio
from minio.error import S3Error
import structlog

# Configure structured logging: emit JSON lines with ISO timestamps so
# log aggregators can parse service output without extra processing.
structlog.configure(
    processors=[
        structlog.stdlib.filter_by_level,
        structlog.stdlib.add_logger_name,
        structlog.stdlib.add_log_level,
        structlog.stdlib.PositionalArgumentsFormatter(),
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.StackInfoRenderer(),
        structlog.processors.format_exc_info,
        structlog.processors.UnicodeDecoder(),
        structlog.processors.JSONRenderer()
    ],
    context_class=dict,
    logger_factory=structlog.stdlib.LoggerFactory(),
    wrapper_class=structlog.stdlib.BoundLogger,
    cache_logger_on_first_use=True,
)

logger = structlog.get_logger()

app = Flask(__name__)

# Configuration — overridable via environment variables.
# NOTE(review): the credential defaults below are development-only;
# production deployments must supply real secrets via the environment.
MINIO_ENDPOINT = os.getenv('MINIO_ENDPOINT', 'localhost:9000')
MINIO_ACCESS_KEY = os.getenv('MINIO_ACCESS_KEY', 'minioadmin')
MINIO_SECRET_KEY = os.getenv('MINIO_SECRET_KEY', 'minioadmin123')
MINIO_BUCKET_NAME = os.getenv('MINIO_BUCKET_NAME', 'file-transformer-bucket')
MINIO_USE_SSL = os.getenv('MINIO_USE_SSL', 'false').lower() == 'true'

POSTGRES_URL = os.getenv('POSTGRES_URL', 'postgresql://file_user:secure_password_123@localhost:5432/file_transformer')

# Initialize MinIO client (shared, module-level)
minio_client = Minio(
    MINIO_ENDPOINT,
    access_key=MINIO_ACCESS_KEY,
    secret_key=MINIO_SECRET_KEY,
    secure=MINIO_USE_SSL
)
|
||||
|
||||
def get_db_connection():
    """Open and return a fresh PostgreSQL connection from POSTGRES_URL."""
    connection = psycopg2.connect(POSTGRES_URL)
    return connection
|
||||
|
||||
def get_file_metadata(file_id: str) -> Optional[Dict[str, Any]]:
    """Fetch the full file row (including deleted rows) as a dict.

    Returns None when the row is absent or the lookup fails; errors are
    logged rather than raised.
    """
    connection = get_db_connection()
    try:
        with connection.cursor(cursor_factory=RealDictCursor) as cursor:
            cursor.execute("""
                SELECT id, filename, original_filename, file_size, file_type,
                       mime_type, bucket_name, object_key, checksum, status,
                       transformation_type, transformation_config, metadata,
                       created_at, updated_at, processed_at
                FROM files
                WHERE id = %s
            """, (file_id,))
            row = cursor.fetchone()
        if row is None:
            return None
        return dict(row)
    except Exception as exc:
        logger.error("Failed to get file metadata", error=str(exc))
        return None
    finally:
        connection.close()
|
||||
|
||||
def get_file_transformations(file_id: str) -> list:
    """Return all transformation rows for a file, newest first.

    Returns an empty list on lookup failure (error logged, not raised).
    """
    connection = get_db_connection()
    try:
        with connection.cursor(cursor_factory=RealDictCursor) as cursor:
            cursor.execute("""
                SELECT id, transformation_type, input_path, output_path,
                       status, config, result, error_message,
                       started_at, completed_at, created_at
                FROM transformations
                WHERE file_id = %s
                ORDER BY created_at DESC
            """, (file_id,))
            rows = cursor.fetchall()
        return [dict(row) for row in rows]
    except Exception as exc:
        logger.error("Failed to get file transformations", error=str(exc))
        return []
    finally:
        connection.close()
|
||||
|
||||
def get_file_access_logs(file_id: str, limit: int = 50) -> list:
    """Return up to `limit` most recent access-log rows for a file.

    Returns an empty list on lookup failure (error logged, not raised).
    """
    connection = get_db_connection()
    try:
        with connection.cursor(cursor_factory=RealDictCursor) as cursor:
            cursor.execute("""
                SELECT id, action, ip_address, user_agent, created_at
                FROM file_access_logs
                WHERE file_id = %s
                ORDER BY created_at DESC
                LIMIT %s
            """, (file_id, limit))
            rows = cursor.fetchall()
        return [dict(row) for row in rows]
    except Exception as exc:
        logger.error("Failed to get file access logs", error=str(exc))
        return []
    finally:
        connection.close()
|
||||
|
||||
def update_file_metadata(file_id: str, metadata: Dict[str, Any]) -> bool:
    """Replace a file's stored metadata JSON wholesale.

    Returns True on success; False on failure (transaction rolled back,
    error logged).
    """
    connection = get_db_connection()
    try:
        with connection.cursor() as cursor:
            cursor.execute("""
                UPDATE files
                SET metadata = %s, updated_at = %s
                WHERE id = %s
            """, (json.dumps(metadata), datetime.utcnow(), file_id))
        connection.commit()
        return True
    except Exception as exc:
        logger.error("Failed to update file metadata", error=str(exc))
        connection.rollback()
        return False
    finally:
        connection.close()
|
||||
|
||||
def get_storage_stats() -> Dict[str, Any]:
    """Aggregate system-wide storage statistics from the files table.

    Returns a dict with three sections:
      - 'stats': total counts/sizes broken down by lifecycle status
      - 'file_types': per-type counts (deleted files excluded)
      - 'recent_uploads': number of files created in the last 24h
    Returns an empty dict if any query fails (error is logged).
    """
    conn = get_db_connection()
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            # Total files and size, plus one counter per status value.
            cur.execute("""
                SELECT COUNT(*) as total_files,
                       SUM(file_size) as total_size,
                       COUNT(CASE WHEN status = 'uploaded' THEN 1 END) as uploaded_files,
                       COUNT(CASE WHEN status = 'processing' THEN 1 END) as processing_files,
                       COUNT(CASE WHEN status = 'transformed' THEN 1 END) as transformed_files,
                       COUNT(CASE WHEN status = 'error' THEN 1 END) as error_files,
                       COUNT(CASE WHEN status = 'deleted' THEN 1 END) as deleted_files
                FROM files
            """)
            stats = cur.fetchone()

            # File types distribution (deleted files excluded).
            cur.execute("""
                SELECT file_type, COUNT(*) as count
                FROM files
                WHERE status != 'deleted'
                GROUP BY file_type
                ORDER BY count DESC
            """)
            file_types = cur.fetchall()

            # Recent activity: uploads within the trailing 24 hours.
            cur.execute("""
                SELECT COUNT(*) as recent_uploads
                FROM files
                WHERE created_at >= NOW() - INTERVAL '24 hours'
            """)
            recent = cur.fetchone()

            return {
                'stats': dict(stats),
                'file_types': [dict(ft) for ft in file_types],
                'recent_uploads': recent['recent_uploads'] if recent else 0
            }
    except Exception as e:
        logger.error("Failed to get storage stats", error=str(e))
        return {}
    finally:
        conn.close()
|
||||
|
||||
@app.route('/health', methods=['GET'])
def health_check():
    """Liveness probe: report that the metadata service is up."""
    payload = {'status': 'healthy', 'service': 'file-metadata'}
    return jsonify(payload)
|
||||
|
||||
@app.route('/files/<file_id>/metadata', methods=['GET'])
def get_file_metadata_endpoint(file_id: str):
    """Aggregate one file's row, transformation history, recent access
    logs, and MinIO object presence into a single JSON document."""
    try:
        record = get_file_metadata(file_id)
        if record is None:
            return jsonify({'error': 'File not found'}), 404

        history = get_file_transformations(file_id)
        recent_access = get_file_access_logs(file_id, limit=10)

        # Probe MinIO for object presence; stat is cheap compared to a
        # full fetch, and an S3Error simply means "not there".
        try:
            minio_client.stat_object(
                record['bucket_name'],
                record['object_key']
            )
            object_present = True
        except S3Error:
            object_present = False

        return jsonify({
            'file': record,
            'transformations': history,
            'access_logs': recent_access,
            'storage': {
                'minio_exists': object_present,
                'bucket': record['bucket_name'],
                'object_key': record['object_key']
            }
        }), 200

    except Exception as exc:
        logger.error("Error fetching file metadata", error=str(exc))
        return jsonify({'error': 'Internal server error'}), 500
|
||||
|
||||
@app.route('/files/<file_id>/metadata', methods=['PUT'])
def update_file_metadata_endpoint(file_id: str):
    """Replace a file's metadata JSON with the PUT request body."""
    try:
        payload = request.get_json()
        if not payload:
            return jsonify({'error': 'No data provided'}), 400

        # Refuse to create metadata for unknown files.
        if get_file_metadata(file_id) is None:
            return jsonify({'error': 'File not found'}), 404

        if not update_file_metadata(file_id, payload):
            return jsonify({'error': 'Failed to update metadata'}), 500

        logger.info("File metadata updated", file_id=file_id)
        return jsonify({'message': 'Metadata updated successfully'}), 200

    except Exception as exc:
        logger.error("Error updating file metadata", error=str(exc))
        return jsonify({'error': 'Internal server error'}), 500
|
||||
|
||||
@app.route('/files/<file_id>/transformations', methods=['GET'])
def get_file_transformations_endpoint(file_id: str):
    """Return the transformation history for a file as a JSON array."""
    try:
        # 404 for unknown files rather than an empty history.
        if get_file_metadata(file_id) is None:
            return jsonify({'error': 'File not found'}), 404

        history = get_file_transformations(file_id)
        return jsonify(history), 200

    except Exception as exc:
        logger.error("Error fetching file transformations", error=str(exc))
        return jsonify({'error': 'Internal server error'}), 500
|
||||
|
||||
@app.route('/files/<file_id>/access-logs', methods=['GET'])
def get_file_access_logs_endpoint(file_id: str):
    """Return recent access-log rows for a file (query param `limit`,
    default 50)."""
    try:
        # 404 for unknown files rather than an empty log list.
        if get_file_metadata(file_id) is None:
            return jsonify({'error': 'File not found'}), 404

        requested_limit = request.args.get('limit', 50, type=int)
        entries = get_file_access_logs(file_id, limit=requested_limit)
        return jsonify(entries), 200

    except Exception as exc:
        logger.error("Error fetching file access logs", error=str(exc))
        return jsonify({'error': 'Internal server error'}), 500
|
||||
|
||||
@app.route('/stats', methods=['GET'])
def get_stats_endpoint():
    """Return system-wide storage statistics as JSON."""
    try:
        return jsonify(get_storage_stats()), 200
    except Exception as exc:
        logger.error("Error fetching stats", error=str(exc))
        return jsonify({'error': 'Internal server error'}), 500
|
||||
|
||||
if __name__ == '__main__':
    # Development entry point only; production runs under gunicorn
    # (see the Dockerfile CMD).
    app.run(host='0.0.0.0', port=5000, debug=False)
|
||||
24
functions/metadata/requirements.txt
Normal file
24
functions/metadata/requirements.txt
Normal file
@@ -0,0 +1,24 @@
|
||||
# Core dependencies
|
||||
flask==2.3.3
|
||||
gunicorn==21.2.0
|
||||
python-dotenv==1.0.0
|
||||
|
||||
# Database
|
||||
psycopg2-binary==2.9.7
|
||||
sqlalchemy==2.0.21
|
||||
|
||||
# MinIO/S3
|
||||
minio==7.1.17
|
||||
boto3==1.28.44
|
||||
|
||||
# HTTP requests
|
||||
requests==2.31.0
|
||||
|
||||
# JSON and data handling
|
||||
pydantic==2.1.1
|
||||
|
||||
# Logging
|
||||
structlog==23.1.0
|
||||
|
||||
# Utilities
|
||||
python-dateutil==2.8.2
|
||||
51
functions/requirements.txt
Normal file
51
functions/requirements.txt
Normal file
@@ -0,0 +1,51 @@
|
||||
# Core dependencies
|
||||
flask==2.3.3
|
||||
gunicorn==21.2.0
|
||||
python-dotenv==1.0.0
|
||||
|
||||
# Database
|
||||
psycopg2-binary==2.9.7
|
||||
sqlalchemy==2.0.21
|
||||
alembic==1.12.0
|
||||
|
||||
# MinIO/S3
|
||||
minio==7.1.17
|
||||
boto3==1.28.44
|
||||
|
||||
# File processing
|
||||
python-magic==0.4.27
|
||||
Pillow==10.0.1
|
||||
PyPDF2==3.0.1
|
||||
python-docx==0.8.11
|
||||
openpyxl==3.1.2
|
||||
pandas==2.0.3
|
||||
numpy==1.24.3
|
||||
|
||||
# HTTP requests
|
||||
requests==2.31.0
|
||||
httpx==0.24.1
|
||||
|
||||
# JSON and data handling
|
||||
pydantic==2.1.1
|
||||
marshmallow==3.20.1
|
||||
|
||||
# Authentication and security
|
||||
PyJWT==2.8.0
|
||||
bcrypt==4.0.1
|
||||
cryptography==41.0.4
|
||||
|
||||
# Logging and monitoring
|
||||
structlog==23.1.0
|
||||
prometheus-client==0.17.1
|
||||
|
||||
# Utilities
|
||||
python-dateutil==2.8.2
|
||||
pytz==2023.3
|
||||
click==8.1.7
|
||||
|
||||
# Development and testing
|
||||
pytest==7.4.2
|
||||
pytest-cov==4.1.0
|
||||
black==23.7.0
|
||||
flake8==6.0.0
|
||||
mypy==1.5.1
|
||||
33
functions/transform/Dockerfile
Normal file
33
functions/transform/Dockerfile
Normal file
@@ -0,0 +1,33 @@
|
||||
FROM python:3.11-slim

# Set working directory
WORKDIR /app

# Install system dependencies (libmagic for file-type detection, GL/glib
# for image libraries).
# FIX: curl is required by the HEALTHCHECK below, but python:*-slim does
# not ship it — without this the healthcheck fails on every probe.
RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc \
    curl \
    libmagic1 \
    libgl1-mesa-glx \
    libglib2.0-0 \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements and install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY app.py .

# Create non-root user
RUN useradd --create-home --shell /bin/bash app && chown -R app:app /app
USER app

# Expose port
EXPOSE 5000

# Health check (hits the Flask /health endpoint)
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:5000/health || exit 1

# Run the application (fewer workers, longer timeout: transforms are
# CPU- and memory-heavy)
CMD ["gunicorn", "--bind", "0.0.0.0:5000", "--workers", "2", "--timeout", "600", "app:app"]
|
||||
473
functions/transform/app.py
Normal file
473
functions/transform/app.py
Normal file
@@ -0,0 +1,473 @@
|
||||
import os
import uuid
import json
import tempfile
import logging
from datetime import datetime
from typing import Dict, Any, Optional, List
from pathlib import Path

from flask import Flask, request, jsonify
import psycopg2
from psycopg2.extras import RealDictCursor
from minio import Minio
from minio.error import S3Error
import structlog

# File processing imports
import PyPDF2
from docx import Document
import pandas as pd
from PIL import Image
import io

# Configure structured logging: emit JSON lines with ISO timestamps so
# log aggregators can parse service output without extra processing.
structlog.configure(
    processors=[
        structlog.stdlib.filter_by_level,
        structlog.stdlib.add_logger_name,
        structlog.stdlib.add_log_level,
        structlog.stdlib.PositionalArgumentsFormatter(),
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.StackInfoRenderer(),
        structlog.processors.format_exc_info,
        structlog.processors.UnicodeDecoder(),
        structlog.processors.JSONRenderer()
    ],
    context_class=dict,
    logger_factory=structlog.stdlib.LoggerFactory(),
    wrapper_class=structlog.stdlib.BoundLogger,
    cache_logger_on_first_use=True,
)

logger = structlog.get_logger()

app = Flask(__name__)

# Configuration — overridable via environment variables.
# NOTE(review): the credential defaults below are development-only;
# production deployments must supply real secrets via the environment.
MINIO_ENDPOINT = os.getenv('MINIO_ENDPOINT', 'localhost:9000')
MINIO_ACCESS_KEY = os.getenv('MINIO_ACCESS_KEY', 'minioadmin')
MINIO_SECRET_KEY = os.getenv('MINIO_SECRET_KEY', 'minioadmin123')
MINIO_BUCKET_NAME = os.getenv('MINIO_BUCKET_NAME', 'file-transformer-bucket')
MINIO_USE_SSL = os.getenv('MINIO_USE_SSL', 'false').lower() == 'true'

POSTGRES_URL = os.getenv('POSTGRES_URL', 'postgresql://file_user:secure_password_123@localhost:5432/file_transformer')

# Initialize MinIO client (shared, module-level)
minio_client = Minio(
    MINIO_ENDPOINT,
    access_key=MINIO_ACCESS_KEY,
    secret_key=MINIO_SECRET_KEY,
    secure=MINIO_USE_SSL
)
|
||||
|
||||
def get_db_connection():
    """Open and return a fresh PostgreSQL connection from POSTGRES_URL."""
    connection = psycopg2.connect(POSTGRES_URL)
    return connection
|
||||
|
||||
def get_file_from_minio(object_key: str) -> bytes:
    """Download an object's bytes from the configured bucket.

    Raises S3Error (after logging) when the object cannot be fetched.
    """
    response = None
    try:
        response = minio_client.get_object(MINIO_BUCKET_NAME, object_key)
        return response.read()
    except S3Error as e:
        logger.error("Failed to get file from MinIO", object_key=object_key, error=str(e))
        raise
    finally:
        # FIX: the original leaked the pooled HTTP connection; always
        # close and release it (mirrors the download service's handling).
        if response is not None:
            response.close()
            response.release_conn()
|
||||
|
||||
def upload_file_to_minio(file_data: bytes, object_key: str) -> bool:
    """Upload raw bytes to the configured bucket; True on success.

    minio 7.x put_object() requires a readable stream, so the bytes are
    wrapped in BytesIO (the original passed raw bytes, which the client
    rejects).
    """
    try:
        minio_client.put_object(
            MINIO_BUCKET_NAME,
            object_key,
            io.BytesIO(file_data),  # FIX: wrap bytes in a stream
            length=len(file_data)
        )
        return True
    except S3Error as e:
        logger.error("Failed to upload file to MinIO", object_key=object_key, error=str(e))
        return False
|
||||
|
||||
def extract_text_from_pdf(file_data: bytes) -> str:
    """Extract and concatenate the text of every page in a PDF.

    Pages are joined with newlines; leading/trailing whitespace is
    stripped. Re-raises any parsing error after logging it.
    """
    try:
        reader = PyPDF2.PdfReader(io.BytesIO(file_data))
        pages = [page.extract_text() + "\n" for page in reader.pages]
        return "".join(pages).strip()
    except Exception as exc:
        logger.error("PDF text extraction failed", error=str(exc))
        raise
|
||||
|
||||
def extract_text_from_docx(file_data: bytes) -> str:
    """Extract and concatenate every paragraph of a DOCX document.

    Paragraphs are joined with newlines; leading/trailing whitespace is
    stripped. Re-raises any parsing error after logging it.
    """
    try:
        document = Document(io.BytesIO(file_data))
        parts = [paragraph.text + "\n" for paragraph in document.paragraphs]
        return "".join(parts).strip()
    except Exception as exc:
        logger.error("DOCX text extraction failed", error=str(exc))
        raise
|
||||
|
||||
def convert_csv_to_json(file_data: bytes) -> List[Dict[str, Any]]:
    """Parse CSV bytes into a list of row dicts (column name -> value).

    Re-raises any parsing error after logging it.
    """
    try:
        frame = pd.read_csv(io.BytesIO(file_data))
        records = frame.to_dict('records')
    except Exception as exc:
        logger.error("CSV to JSON conversion failed", error=str(exc))
        raise
    return records
|
||||
|
||||
def convert_excel_to_json(file_data: bytes) -> List[Dict[str, Any]]:
    """Parse the first sheet of an Excel workbook into row dicts.

    Re-raises any parsing error after logging it.
    """
    try:
        frame = pd.read_excel(io.BytesIO(file_data))
        records = frame.to_dict('records')
    except Exception as exc:
        logger.error("Excel to JSON conversion failed", error=str(exc))
        raise
    return records
|
||||
|
||||
def resize_image(file_data: bytes, width: int, height: int) -> bytes:
    """Resize an image to exactly (width, height) using Lanczos resampling.

    The output keeps the source format when Pillow can detect it and
    falls back to JPEG otherwise. Re-raises any error after logging it.
    """
    try:
        source = Image.open(io.BytesIO(file_data))
        target_format = source.format or 'JPEG'
        scaled = source.resize((width, height), Image.Resampling.LANCZOS)
        buffer = io.BytesIO()
        scaled.save(buffer, format=target_format)
        return buffer.getvalue()
    except Exception as exc:
        logger.error("Image resize failed", error=str(exc))
        raise
|
||||
|
||||
def convert_image_format(file_data: bytes, target_format: str) -> bytes:
    """Re-encode an image into `target_format` (case-insensitive).

    Re-raises any error after logging it.
    """
    try:
        source = Image.open(io.BytesIO(file_data))
        buffer = io.BytesIO()
        source.save(buffer, format=target_format.upper())
        return buffer.getvalue()
    except Exception as exc:
        logger.error("Image format conversion failed", error=str(exc))
        raise
|
||||
|
||||
def create_transformation_record(file_id: str, transformation_type: str, config: Dict[str, Any]) -> str:
    """Insert a pending transformation row and return its id as a string.

    Re-raises on failure after rolling back and logging.
    """
    connection = get_db_connection()
    try:
        with connection.cursor(cursor_factory=RealDictCursor) as cursor:
            cursor.execute("""
                INSERT INTO transformations (
                    file_id, transformation_type, input_path, status, config, created_at
                ) VALUES (%s, %s, %s, %s, %s, %s)
                RETURNING id
            """, (
                file_id,
                transformation_type,
                f"files/{file_id}",
                'pending',
                json.dumps(config),
                datetime.utcnow()
            ))
            new_id = cursor.fetchone()['id']
        connection.commit()
        return str(new_id)
    except Exception as exc:
        connection.rollback()
        logger.error("Failed to create transformation record", error=str(exc))
        raise
    finally:
        connection.close()
|
||||
|
||||
def update_transformation_status(transformation_id: str, status: str, result: Optional[Dict[str, Any]] = None, error_message: Optional[str] = None):
    """Advance a transformation row to *status* and stamp the matching timestamp.

    Args:
        transformation_id: Row id in ``transformations``.
        status: One of 'processing', 'completed' or 'failed'. Any other
            value falls through every branch and commits without updating
            anything.
        result: JSON-serialisable payload stored only when status is
            'completed'.
        error_message: Stored only when status is 'failed'.

    Raises:
        Exception: Database errors are logged, rolled back and re-raised.
    """
    conn = get_db_connection()
    try:
        with conn.cursor() as cur:
            if status == 'processing':
                # Mark the start of work.
                cur.execute("""
                    UPDATE transformations
                    SET status = %s, started_at = %s
                    WHERE id = %s
                """, (status, datetime.utcnow(), transformation_id))
            elif status == 'completed':
                # Success: persist the result payload alongside the end time.
                cur.execute("""
                    UPDATE transformations
                    SET status = %s, completed_at = %s, result = %s
                    WHERE id = %s
                """, (status, datetime.utcnow(), json.dumps(result), transformation_id))
            elif status == 'failed':
                # Failure: persist the error text alongside the end time.
                cur.execute("""
                    UPDATE transformations
                    SET status = %s, completed_at = %s, error_message = %s
                    WHERE id = %s
                """, (status, datetime.utcnow(), error_message, transformation_id))

        conn.commit()
    except Exception as e:
        conn.rollback()
        logger.error("Failed to update transformation status", error=str(e))
        raise
    finally:
        conn.close()
def get_file_info(file_id: str) -> Optional[Dict[str, Any]]:
    """Fetch one row from ``files`` by id.

    Args:
        file_id: Primary key of the file row.

    Returns:
        The row as a plain dict, or ``None`` when the id is unknown or a
        database error occurs (errors are logged, not raised).
    """
    conn = get_db_connection()
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT id, filename, file_type, mime_type, object_key, status
                FROM files
                WHERE id = %s
            """, (file_id,))

            file_record = cur.fetchone()
            # RealDictCursor rows convert cleanly to dict; None means not found.
            return dict(file_record) if file_record else None
    except Exception as e:
        logger.error("Failed to get file info", error=str(e))
        return None
    finally:
        conn.close()
@app.route('/health', methods=['GET'])
def health_check():
    """Liveness probe for the transform service."""
    payload = {'status': 'healthy', 'service': 'file-transform'}
    return jsonify(payload)
@app.route('/transform', methods=['POST'])
def transform_file():
    """Handle a file transformation request.

    Expects JSON ``{"file_id": ..., "transformation_type": ..., "config": {...}}``.
    Creates a transformation record, runs the requested transformation,
    stores the output object in MinIO and returns the completed record.

    Returns:
        200 with the completed record; 400/404 for bad input; 500 with a
        'failed' payload when the transformation itself errors.
    """
    try:
        data = request.get_json()
        if not data:
            return jsonify({'error': 'No data provided'}), 400

        file_id = data.get('file_id')
        transformation_type = data.get('transformation_type')
        config = data.get('config', {})

        if not file_id or not transformation_type:
            return jsonify({'error': 'file_id and transformation_type are required'}), 400

        # Look up the source file and reject deleted/unknown files early.
        file_info = get_file_info(file_id)
        if not file_info:
            return jsonify({'error': 'File not found'}), 404

        if file_info['status'] == 'deleted':
            return jsonify({'error': 'File has been deleted'}), 400

        # Create the DB record and mark it in-progress before doing work.
        transformation_id = create_transformation_record(file_id, transformation_type, config)
        update_transformation_status(transformation_id, 'processing')

        logger.info("Starting transformation",
                    file_id=file_id,
                    transformation_id=transformation_id,
                    transformation_type=transformation_type)

        try:
            file_data = get_file_from_minio(file_info['object_key'])

            result, output_data, output_filename, output_object_key = _apply_transformation(
                file_info, transformation_id, transformation_type, config, file_data)

            # Upload the transformed artifact to MinIO.
            if output_data:
                if not upload_file_to_minio(output_data, output_object_key):
                    raise Exception("Failed to upload transformed file")

            # Mark the transformation completed, storing only JSON-safe results.
            update_transformation_status(transformation_id, 'completed', {
                'output_object_key': output_object_key,
                'output_filename': output_filename,
                'result': result if isinstance(result, (str, list, dict)) else None
            })

            _mark_file_transformed(file_id, transformation_type)

            response_data = {
                'transformation_id': transformation_id,
                'file_id': file_id,
                'transformation_type': transformation_type,
                'status': 'completed',
                'output_object_key': output_object_key,
                'output_filename': output_filename,
                'completed_at': datetime.utcnow().isoformat()
            }

            logger.info("Transformation completed",
                        transformation_id=transformation_id,
                        file_id=file_id)

            return jsonify(response_data), 200

        except Exception as e:
            error_message = str(e)
            logger.error("Transformation failed",
                         transformation_id=transformation_id,
                         file_id=file_id,
                         error=error_message)

            # Record the failure so it can be inspected/retried later.
            update_transformation_status(transformation_id, 'failed', error_message=error_message)

            return jsonify({
                'transformation_id': transformation_id,
                'file_id': file_id,
                'status': 'failed',
                'error': error_message
            }), 500

    except Exception as e:
        logger.error("Transform request error", error=str(e))
        return jsonify({'error': 'Internal server error'}), 500


def _apply_transformation(file_info, transformation_id, transformation_type, config, file_data):
    """Dispatch one transformation; return (result, output_data, output_filename, output_object_key).

    Raises ValueError for unsupported type/file-type combinations.
    """
    stem = Path(file_info['filename']).stem
    suffix = Path(file_info['filename']).suffix  # includes the leading dot (may be '')
    result = None

    if transformation_type == 'extract_text':
        if file_info['file_type'] == 'pdf':
            result = extract_text_from_pdf(file_data)
        elif file_info['file_type'] in ['docx', 'doc']:
            result = extract_text_from_docx(file_data)
        else:
            raise ValueError(f"Text extraction not supported for file type: {file_info['file_type']}")
        output_filename = f"{stem}_extracted.txt"
        output_data = result.encode('utf-8')

    elif transformation_type == 'csv_to_json':
        if file_info['file_type'] != 'csv':
            raise ValueError("CSV to JSON conversion only supports CSV files")
        result = convert_csv_to_json(file_data)
        output_filename = f"{stem}.json"
        output_data = json.dumps(result, indent=2).encode('utf-8')

    elif transformation_type == 'excel_to_json':
        if file_info['file_type'] not in ['xlsx', 'xls']:
            raise ValueError("Excel to JSON conversion only supports Excel files")
        result = convert_excel_to_json(file_data)
        output_filename = f"{stem}.json"
        output_data = json.dumps(result, indent=2).encode('utf-8')

    elif transformation_type == 'resize_image':
        if not file_info['mime_type'].startswith('image/'):
            raise ValueError("Image resize only supports image files")
        width = config.get('width', 800)
        height = config.get('height', 600)
        output_data = resize_image(file_data, width, height)
        # BUG FIX: Path.suffix already contains the dot; the previous code
        # inserted a second one, producing names like "photo_resized..png".
        output_filename = f"{stem}_resized{suffix}"

    elif transformation_type == 'convert_image':
        if not file_info['mime_type'].startswith('image/'):
            raise ValueError("Image conversion only supports image files")
        target_format = config.get('format', 'JPEG')
        output_data = convert_image_format(file_data, target_format)
        output_filename = f"{stem}.{target_format.lower()}"

    else:
        raise ValueError(f"Unsupported transformation type: {transformation_type}")

    output_object_key = f"transformations/{transformation_id}/{output_filename}"
    return result, output_data, output_filename, output_object_key


def _mark_file_transformed(file_id, transformation_type):
    """Record on the ``files`` row that a transformation finished.

    BUG FIX: the original inline code left the connection open if the
    UPDATE raised; the try/finally guarantees it is closed.
    """
    conn = get_db_connection()
    try:
        with conn.cursor() as cur:
            cur.execute("""
                UPDATE files
                SET status = 'transformed', transformation_type = %s, processed_at = %s
                WHERE id = %s
            """, (transformation_type, datetime.utcnow(), file_id))
            conn.commit()
    finally:
        conn.close()
@app.route('/transformations/<transformation_id>', methods=['GET'])
def get_transformation_status(transformation_id: str):
    """Return status and details for one transformation.

    BUG FIX: ``conn`` is initialised before the try block — previously, if
    ``get_db_connection()`` raised, the ``finally`` clause hit a NameError
    on ``conn.close()`` instead of returning the 500 response.

    Returns:
        200 with the row; 404 when unknown; 500 on database errors.
    """
    conn = None
    try:
        conn = get_db_connection()
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT id, file_id, transformation_type, input_path, output_path,
                       status, config, result, error_message, started_at, completed_at, created_at
                FROM transformations
                WHERE id = %s
            """, (transformation_id,))

            transformation = cur.fetchone()
            if not transformation:
                return jsonify({'error': 'Transformation not found'}), 404

            return jsonify(dict(transformation)), 200

    except Exception as e:
        logger.error("Error fetching transformation", error=str(e))
        return jsonify({'error': 'Internal server error'}), 500
    finally:
        if conn is not None:
            conn.close()
@app.route('/transformations/<transformation_id>/retry', methods=['POST'])
def retry_transformation(transformation_id: str):
    """Retry a failed transformation by resetting it and re-posting to /transform.

    BUG FIXES:
      * The original SELECT omitted the ``status`` column that is checked
        below, so every retry raised KeyError before the guard could run.
      * ``conn`` is initialised before the try block so the ``finally``
        clause cannot raise NameError when the connection itself fails.

    Returns:
        The proxied /transform response; 404 when unknown; 400 when the
        transformation is not in the 'failed' state; 500 on errors.
    """
    conn = None
    try:
        conn = get_db_connection()
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            # Fetch the details needed to re-run it (status drives the guard).
            cur.execute("""
                SELECT file_id, transformation_type, config, status
                FROM transformations
                WHERE id = %s
            """, (transformation_id,))

            transformation = cur.fetchone()
            if not transformation:
                return jsonify({'error': 'Transformation not found'}), 404

            if transformation['status'] != 'failed':
                return jsonify({'error': 'Only failed transformations can be retried'}), 400

            # Reset the row to a clean 'pending' state.
            cur.execute("""
                UPDATE transformations
                SET status = 'pending', started_at = NULL, completed_at = NULL,
                    error_message = NULL, result = NULL
                WHERE id = %s
            """, (transformation_id,))
            conn.commit()

        # Re-trigger the transformation with the original parameters.
        transform_data = {
            'file_id': transformation['file_id'],
            'transformation_type': transformation['transformation_type'],
            'config': transformation['config'] or {}
        }

        # Call the transform endpoint internally via the test client.
        with app.test_client() as client:
            response = client.post('/transform', json=transform_data)
            return response.get_json(), response.status_code

    except Exception as e:
        logger.error("Error retrying transformation", error=str(e))
        return jsonify({'error': 'Internal server error'}), 500
    finally:
        if conn is not None:
            conn.close()
# Dev-server entry point; production serves via gunicorn (see Dockerfile CMD).
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000, debug=False)
33
functions/transform/requirements.txt
Normal file
33
functions/transform/requirements.txt
Normal file
@@ -0,0 +1,33 @@
|
||||
# Core dependencies
|
||||
flask==2.3.3
|
||||
gunicorn==21.2.0
|
||||
python-dotenv==1.0.0
|
||||
|
||||
# Database
|
||||
psycopg2-binary==2.9.7
|
||||
sqlalchemy==2.0.21
|
||||
|
||||
# MinIO/S3
|
||||
minio==7.1.17
|
||||
boto3==1.28.44
|
||||
|
||||
# File processing
|
||||
python-magic==0.4.27
|
||||
Pillow==10.0.1
|
||||
PyPDF2==3.0.1
|
||||
python-docx==0.8.11
|
||||
openpyxl==3.1.2
|
||||
pandas==2.0.3
|
||||
numpy==1.24.3
|
||||
|
||||
# HTTP requests
|
||||
requests==2.31.0
|
||||
|
||||
# JSON and data handling
|
||||
pydantic==2.1.1
|
||||
|
||||
# Logging
|
||||
structlog==23.1.0
|
||||
|
||||
# Utilities
|
||||
python-dateutil==2.8.2
|
||||
31
functions/upload/Dockerfile
Normal file
31
functions/upload/Dockerfile
Normal file
@@ -0,0 +1,31 @@
|
||||
FROM python:3.11-slim

# Set working directory
WORKDIR /app

# Install system dependencies.
# BUG FIX: curl is required by the HEALTHCHECK below but python:3.11-slim
# does not ship it, so the health check always failed; install it here.
# libmagic1 backs the python-magic dependency.
RUN apt-get update && apt-get install -y \
    gcc \
    libmagic1 \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements and install Python dependencies first (better layer caching)
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY app.py .

# Create non-root user
RUN useradd --create-home --shell /bin/bash app && chown -R app:app /app
USER app

# Expose port
EXPOSE 5000

# Health check
HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
    CMD curl -f http://localhost:5000/health || exit 1

# Run the application
CMD ["gunicorn", "--bind", "0.0.0.0:5000", "--workers", "4", "--timeout", "300", "app:app"]
||||
287
functions/upload/app.py
Normal file
287
functions/upload/app.py
Normal file
@@ -0,0 +1,287 @@
|
||||
import os
|
||||
import uuid
|
||||
import hashlib
|
||||
import magic
|
||||
import logging
|
||||
from datetime import datetime
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
from flask import Flask, request, jsonify
|
||||
from werkzeug.utils import secure_filename
|
||||
import psycopg2
|
||||
from psycopg2.extras import RealDictCursor
|
||||
from minio import Minio
|
||||
from minio.error import S3Error
|
||||
import structlog
|
||||
|
||||
# Configure structured logging: stdlib-logging integration with
# ISO timestamps and JSON-rendered records.
structlog.configure(
    processors=[
        structlog.stdlib.filter_by_level,
        structlog.stdlib.add_logger_name,
        structlog.stdlib.add_log_level,
        structlog.stdlib.PositionalArgumentsFormatter(),
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.StackInfoRenderer(),
        structlog.processors.format_exc_info,
        structlog.processors.UnicodeDecoder(),
        structlog.processors.JSONRenderer()
    ],
    context_class=dict,
    logger_factory=structlog.stdlib.LoggerFactory(),
    wrapper_class=structlog.stdlib.BoundLogger,
    cache_logger_on_first_use=True,
)

# Module-level logger shared by all handlers in this service.
logger = structlog.get_logger()

app = Flask(__name__)

# Configuration — every value can be overridden via environment variables.
# NOTE(review): the hard-coded credential defaults look like local-dev
# placeholders; confirm they are overridden in every deployed environment.
MINIO_ENDPOINT = os.getenv('MINIO_ENDPOINT', 'localhost:9000')
MINIO_ACCESS_KEY = os.getenv('MINIO_ACCESS_KEY', 'minioadmin')
MINIO_SECRET_KEY = os.getenv('MINIO_SECRET_KEY', 'minioadmin123')
MINIO_BUCKET_NAME = os.getenv('MINIO_BUCKET_NAME', 'file-transformer-bucket')
MINIO_USE_SSL = os.getenv('MINIO_USE_SSL', 'false').lower() == 'true'

POSTGRES_URL = os.getenv('POSTGRES_URL', 'postgresql://file_user:secure_password_123@localhost:5432/file_transformer')

# Initialize the MinIO client once at import time; reused by all handlers.
minio_client = Minio(
    MINIO_ENDPOINT,
    access_key=MINIO_ACCESS_KEY,
    secret_key=MINIO_SECRET_KEY,
    secure=MINIO_USE_SSL
)
def get_db_connection():
    """Open and return a new PostgreSQL connection (caller must close it)."""
    connection = psycopg2.connect(POSTGRES_URL)
    return connection
def calculate_file_hash(file_data: bytes) -> str:
    """Return the hex-encoded SHA-256 digest of *file_data*."""
    digest = hashlib.sha256()
    digest.update(file_data)
    return digest.hexdigest()
def get_file_metadata(file_data: bytes, filename: str) -> Dict[str, Any]:
    """Build the metadata dict for an upload.

    Returns a dict with keys: mime_type (sniffed from content via libmagic),
    file_size (bytes), file_type (lower-cased extension, '' if none) and
    checksum (SHA-256 hex digest).
    """
    # Extension comes from the name, MIME type from the actual bytes.
    extension = filename.rpartition('.')[2].lower() if '.' in filename else ''

    return {
        'mime_type': magic.from_buffer(file_data, mime=True),
        'file_size': len(file_data),
        'file_type': extension,
        'checksum': calculate_file_hash(file_data),
    }
def save_file_to_database(file_data: bytes, filename: str, object_key: str, metadata: Dict[str, Any]) -> str:
    """Insert a row into ``files`` for a freshly uploaded object.

    Args:
        file_data: Raw upload bytes (not used directly here; the caller
            derived *metadata* from it).
        filename: Sanitised name; stored as both filename and
            original_filename.
        object_key: MinIO key; also stored as file_path.
        metadata: Dict with file_size, file_type, mime_type and checksum.

    Returns:
        The new file id as a string.

    Raises:
        Exception: Database errors are logged, rolled back and re-raised.
    """
    conn = get_db_connection()
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            # Insert file record
            cur.execute("""
                INSERT INTO files (
                    filename, original_filename, file_path, file_size,
                    file_type, mime_type, bucket_name, object_key,
                    checksum, status, created_at
                ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                RETURNING id
            """, (
                filename,
                filename,
                object_key,
                metadata['file_size'],
                metadata['file_type'],
                metadata['mime_type'],
                MINIO_BUCKET_NAME,
                object_key,
                metadata['checksum'],
                'uploaded',
                datetime.utcnow()
            ))

            file_id = cur.fetchone()['id']
            conn.commit()
            return str(file_id)
    except Exception as e:
        conn.rollback()
        logger.error("Database error", error=str(e))
        raise
    finally:
        conn.close()
def upload_to_minio(file_data: bytes, object_key: str) -> bool:
    """Upload raw bytes to the configured MinIO bucket.

    Creates the bucket on first use.

    BUG FIX: ``put_object`` requires a file-like object with ``read()``;
    the original passed raw ``bytes``, which the MinIO SDK rejects. The
    data is now wrapped in ``io.BytesIO``.

    Args:
        file_data: Object content.
        object_key: Destination key within the bucket.

    Returns:
        True on success, False on any S3 error (logged).
    """
    import io  # local import keeps this fix self-contained

    try:
        # Ensure bucket exists
        if not minio_client.bucket_exists(MINIO_BUCKET_NAME):
            minio_client.make_bucket(MINIO_BUCKET_NAME)
            logger.info("Created bucket", bucket=MINIO_BUCKET_NAME)

        # Upload file (put_object streams from a readable object)
        minio_client.put_object(
            MINIO_BUCKET_NAME,
            object_key,
            io.BytesIO(file_data),
            length=len(file_data)
        )

        logger.info("File uploaded to MinIO", bucket=MINIO_BUCKET_NAME, object_key=object_key)
        return True
    except S3Error as e:
        logger.error("MinIO upload error", error=str(e))
        return False
@app.route('/health', methods=['GET'])
def health_check():
    """Liveness probe for the upload service."""
    payload = {'status': 'healthy', 'service': 'file-upload'}
    return jsonify(payload)
@app.route('/upload', methods=['POST'])
def upload_file():
    """Handle a file upload request.

    Accepts multipart form data with a ``file`` part, stores the object in
    MinIO under ``uploads/<uuid>/<filename>`` and records it in PostgreSQL.

    Returns:
        201 with the file record on success; 400 for missing/empty input;
        500 when storage or database writes fail.
    """
    try:
        # Check if file is present in request
        if 'file' not in request.files:
            return jsonify({'error': 'No file provided'}), 400

        file = request.files['file']
        if file.filename == '':
            return jsonify({'error': 'No file selected'}), 400

        # Read file data
        file_data = file.read()
        if not file_data:
            return jsonify({'error': 'Empty file'}), 400

        # Secure filename and generate object key.
        # BUG FIX: the object key previously ended in a literal placeholder
        # instead of the sanitised filename, so every stored object lost
        # its name.
        filename = secure_filename(file.filename)
        file_id = str(uuid.uuid4())
        object_key = f"uploads/{file_id}/{filename}"

        # Extract metadata
        metadata = get_file_metadata(file_data, filename)

        logger.info("Processing file upload",
                    filename=filename,
                    size=metadata['file_size'],
                    mime_type=metadata['mime_type'])

        # Upload to MinIO
        if not upload_to_minio(file_data, object_key):
            return jsonify({'error': 'Failed to upload file to storage'}), 500

        # Save to database
        db_file_id = save_file_to_database(file_data, filename, object_key, metadata)

        # Audit trail
        log_file_access(db_file_id, 'upload', request.remote_addr, request.headers.get('User-Agent'))

        response_data = {
            'file_id': db_file_id,
            'filename': filename,
            'object_key': object_key,
            'file_size': metadata['file_size'],
            'mime_type': metadata['mime_type'],
            'checksum': metadata['checksum'],
            'status': 'uploaded',
            'uploaded_at': datetime.utcnow().isoformat()
        }

        logger.info("File upload completed", file_id=db_file_id, filename=filename)
        return jsonify(response_data), 201

    except Exception as e:
        logger.error("Upload error", error=str(e))
        return jsonify({'error': 'Internal server error'}), 500
def log_file_access(file_id: str, action: str, ip_address: str, user_agent: Optional[str]):
    """Best-effort audit-log insert; failures are logged and swallowed.

    Args:
        file_id: Id of the file acted on.
        action: Verb such as 'upload' or 'delete'.
        ip_address: Client address taken from the request.
        user_agent: Client User-Agent header, if any.
    """
    conn = get_db_connection()
    try:
        with conn.cursor() as cur:
            cur.execute("""
                INSERT INTO file_access_logs (file_id, action, ip_address, user_agent)
                VALUES (%s, %s, %s, %s)
            """, (file_id, action, ip_address, user_agent))
            conn.commit()
    except Exception as e:
        # Deliberately non-fatal: auditing must never break the request path.
        logger.error("Failed to log file access", error=str(e))
        conn.rollback()
    finally:
        conn.close()
@app.route('/files/<file_id>', methods=['GET'])
def get_file_info(file_id: str):
    """Return the stored record for one file.

    BUG FIX: ``conn`` is initialised before the try block — previously, if
    ``get_db_connection()`` raised, the ``finally`` clause hit a NameError
    on ``conn.close()`` instead of returning the 500 response.

    Returns:
        200 with the row; 404 when unknown; 500 on database errors.
    """
    conn = None
    try:
        conn = get_db_connection()
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute("""
                SELECT id, filename, original_filename, file_size, file_type,
                       mime_type, bucket_name, object_key, checksum, status,
                       created_at, updated_at
                FROM files
                WHERE id = %s
            """, (file_id,))

            file_record = cur.fetchone()
            if not file_record:
                return jsonify({'error': 'File not found'}), 404

            return jsonify(dict(file_record)), 200

    except Exception as e:
        logger.error("Error fetching file info", error=str(e))
        return jsonify({'error': 'Internal server error'}), 500
    finally:
        if conn is not None:
            conn.close()
@app.route('/files/<file_id>', methods=['DELETE'])
def delete_file(file_id: str):
    """Soft-delete a file: remove the object from MinIO, mark the DB row.

    BUG FIX: ``conn`` is initialised before the try block so the ``finally``
    clause cannot raise NameError when ``get_db_connection()`` itself fails.

    Returns:
        200 on success; 404 when the file is unknown; 500 on errors.
    """
    conn = None
    try:
        conn = get_db_connection()
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            # Get file info
            cur.execute("SELECT object_key FROM files WHERE id = %s", (file_id,))
            file_record = cur.fetchone()

            if not file_record:
                return jsonify({'error': 'File not found'}), 404

            object_key = file_record['object_key']

            # Delete from MinIO; a missing object is tolerated because the
            # database soft-delete is the source of truth.
            try:
                minio_client.remove_object(MINIO_BUCKET_NAME, object_key)
                logger.info("File deleted from MinIO", object_key=object_key)
            except S3Error as e:
                logger.warning("File not found in MinIO", object_key=object_key, error=str(e))

            # Mark as deleted in database
            cur.execute("""
                UPDATE files
                SET status = 'deleted', deleted_at = %s
                WHERE id = %s
            """, (datetime.utcnow(), file_id))

            conn.commit()

        # Audit trail
        log_file_access(file_id, 'delete', request.remote_addr, request.headers.get('User-Agent'))

        return jsonify({'message': 'File deleted successfully'}), 200

    except Exception as e:
        logger.error("Error deleting file", error=str(e))
        return jsonify({'error': 'Internal server error'}), 500
    finally:
        if conn is not None:
            conn.close()
# Dev-server entry point; production serves via gunicorn (see Dockerfile CMD).
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000, debug=False)
27
functions/upload/requirements.txt
Normal file
27
functions/upload/requirements.txt
Normal file
@@ -0,0 +1,27 @@
|
||||
# Core dependencies
|
||||
flask==2.3.3
|
||||
gunicorn==21.2.0
|
||||
python-dotenv==1.0.0
|
||||
|
||||
# Database
|
||||
psycopg2-binary==2.9.7
|
||||
sqlalchemy==2.0.21
|
||||
|
||||
# MinIO/S3
|
||||
minio==7.1.17
|
||||
boto3==1.28.44
|
||||
|
||||
# File processing
|
||||
python-magic==0.4.27
|
||||
|
||||
# HTTP requests
|
||||
requests==2.31.0
|
||||
|
||||
# JSON and data handling
|
||||
pydantic==2.1.1
|
||||
|
||||
# Logging
|
||||
structlog==23.1.0
|
||||
|
||||
# Utilities
|
||||
python-dateutil==2.8.2
|
||||
Reference in New Issue
Block a user