Essential tools for developing, optimizing, and deploying AI models on edge devices.
Model Development Frameworks
TensorFlow Ecosystem
| Tool | Purpose | Platform Support |
|------|---------|------------------|
| TensorFlow Lite | Mobile/Edge inference | Android, iOS, Linux, MCU |
| TensorFlow.js | Browser/Node.js deployment | Web browsers, Node.js |
| TensorFlow Micro | Microcontroller deployment | Arduino, ESP32, Cortex-M |
| Model Optimization Toolkit | Model compression | All TensorFlow platforms |
# TensorFlow Lite conversion pipeline
import tensorflow as tf
def convert_to_tflite(saved_model_path, optimization_level='default',
                      representative_dataset=None):
    """Convert a TensorFlow SavedModel to a TensorFlow Lite flatbuffer.

    Args:
        saved_model_path: Path to the SavedModel directory.
        optimization_level: 'default' (no extra optimization), 'size'
            (default optimizations + float16 weights), or 'speed'
            (default optimizations + full-integer INT8 builtin ops).
        representative_dataset: Optional generator yielding calibration
            samples. The TFLite converter needs this to derive activation
            ranges for full-integer ('speed') quantization; without it,
            convert() raises for INT8-only op sets.

    Returns:
        The serialized .tflite model as bytes.
    """
    converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_path)
    if optimization_level == 'size':
        converter.optimizations = [tf.lite.Optimize.DEFAULT]
        converter.target_spec.supported_types = [tf.float16]
    elif optimization_level == 'speed':
        converter.optimizations = [tf.lite.Optimize.DEFAULT]
        converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
        # Calibration data is what makes full-integer quantization possible.
        if representative_dataset is not None:
            converter.representative_dataset = representative_dataset
    return converter.convert()
# Usage example: convert the SavedModel with size-oriented optimizations
# and persist the resulting flatbuffer to disk.
model_path = 'saved_model/'
tflite_model = convert_to_tflite(model_path, 'size')
with open('model.tflite', 'wb') as tflite_file:
    tflite_file.write(tflite_model)
PyTorch Mobile
# PyTorch Mobile workflow
# Install PyTorch and torchvision (prerequisites for mobile conversion).
pip install torch torchvision
# Convert model to mobile format
# NOTE(review): torch.utils.mobile_optimizer is documented as a Python
# API (optimize_for_mobile); confirm it actually supports this
# `python -m` command-line invocation before publishing.
python -m torch.utils.mobile_optimizer \
--model model.pt \
--output model_mobile.pt \
--optimization_level 2
| Tool | Function | Command |
|------|----------|---------|
| JetPack SDK | Complete development environment | `sudo apt install nvidia-jetpack` |
| TensorRT | Inference optimization | `trtexec --onnx=model.onnx --saveEngine=model.trt` |
| Nsight Systems | Performance profiling | `nsys profile python inference.py` |
| tegrastats | System monitoring | `tegrastats --interval 1000` |
Intel OpenVINO
# OpenVINO model optimization
from openvino.tools import mo
# Convert ONNX to OpenVINO IR
# Conversion settings: FP16 weights, with the standard ImageNet
# mean/scale normalisation folded into the converted graph.
mo_args = dict(
    input_model='model.onnx',
    output_dir='./openvino_model',
    data_type='FP16',
    mean_values=[123.675, 116.28, 103.53],
    scale_values=[58.395, 57.12, 57.375],
)
# Model Optimizer execution
mo.convert_model(**mo_args)
ONNX Ecosystem
| Tool | Purpose | Supported Formats |
|------|---------|-------------------|
| ONNX Runtime | Cross-platform inference | ONNX, PyTorch, TensorFlow |
| ONNX Simplifier | Model optimization | ONNX |
| Netron | Model visualization | ONNX, TensorFlow, PyTorch |
# ONNX model deployment
import onnxruntime as ort
def create_onnx_session(model_path, providers=None):
    """Create an ONNX Runtime session with all graph optimizations enabled.

    Args:
        model_path: Path to the .onnx model file.
        providers: Optional list of execution providers in priority order.
            Defaults to CPU-only execution. (The previous signature used a
            mutable list literal as the default, which Python shares across
            calls; a None sentinel avoids that pitfall.)

    Returns:
        A configured ort.InferenceSession ready for run() calls.
    """
    if providers is None:
        providers = ['CPUExecutionProvider']
    session_options = ort.SessionOptions()
    # Apply every graph-level optimization ORT offers before loading.
    session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
    return ort.InferenceSession(
        model_path,
        sess_options=session_options,
        providers=providers,
    )
Benchmarking and Profiling
# Model benchmarking suite
import time
import psutil
import numpy as np
class EdgeModelBenchmark:
    """Measure inference latency and memory footprint of an edge model.

    The class is framework-agnostic: subclasses supply load_model(), and
    the loaded model only needs to expose predict(input) -> output.
    """

    def __init__(self, model_path, input_shape):
        # load_model is an extension point; the base class cannot know
        # how to deserialize any particular model format.
        self.model = self.load_model(model_path)
        self.input_shape = input_shape

    def load_model(self, model_path):
        """Load and return a model object exposing predict().

        The original code called this method without ever defining it,
        so every instantiation failed with AttributeError. Subclasses
        override this with framework-specific loading (TFLite, ONNX, ...).
        """
        raise NotImplementedError(
            "EdgeModelBenchmark.load_model must be overridden by a subclass"
        )

    def benchmark_inference(self, num_runs=100):
        """Benchmark inference latency and per-call memory delta.

        Performs 10 untimed warm-up inferences on random float32 input,
        then num_runs timed runs.

        Returns:
            Dict with average/p95/p99 latency (ms), throughput (FPS
            derived from the mean latency), and the mean per-inference
            RSS delta in MB as reported by psutil.
        """
        # Warm-up runs to amortize lazy initialization and caches.
        dummy_input = np.random.random(self.input_shape).astype(np.float32)
        for _ in range(10):
            _ = self.model.predict(dummy_input)

        latencies = []
        memory_usage = []
        for _ in range(num_runs):
            # RSS before inference (MB).
            mem_before = psutil.Process().memory_info().rss / 1024 / 1024
            start_time = time.perf_counter()
            result = self.model.predict(dummy_input)
            end_time = time.perf_counter()
            # RSS after inference (MB).
            mem_after = psutil.Process().memory_info().rss / 1024 / 1024
            latencies.append((end_time - start_time) * 1000)  # seconds -> ms
            memory_usage.append(mem_after - mem_before)

        return {
            'avg_latency_ms': np.mean(latencies),
            'p95_latency_ms': np.percentile(latencies, 95),
            'p99_latency_ms': np.percentile(latencies, 99),
            'throughput_fps': 1000 / np.mean(latencies),
            'memory_usage_mb': np.mean(memory_usage),
        }
# Benchmark results example
# Illustrative MobileNetV2 figures across three edge targets; these are
# sample/reference numbers, not live measurements from this code.
benchmark_results = {
'mobilenet_v2': {
'jetson_nano': {'latency': '23ms', 'throughput': '43 FPS', 'memory': '45MB'},
'raspberry_pi4': {'latency': '89ms', 'throughput': '11 FPS', 'memory': '78MB'},
'coral_tpu': {'latency': '2.5ms', 'throughput': '400 FPS', 'memory': '12MB'}
}
}
Deployment and MLOps
Container Solutions
# Multi-stage Docker build for edge deployment
# Stage 1: install Python dependencies in a throwaway builder image so
# build tooling never reaches the final image.
FROM python:3.9-slim as builder
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Stage 2: slim runtime image that reuses the builder's site-packages.
FROM python:3.9-slim as runtime
# Install only runtime dependencies
COPY --from=builder /usr/local/lib/python3.9/site-packages /usr/local/lib/python3.9/site-packages
# NOTE(review): the runtime stage sets no WORKDIR, so these files land
# in / — confirm that is intended (CMD below resolves app.py from /).
COPY model.tflite app.py ./
# Optimize for edge deployment
ENV PYTHONUNBUFFERED=1
ENV PYTHONDONTWRITEBYTECODE=1
EXPOSE 8080
CMD ["python", "app.py"]
Edge Orchestration
# Kubernetes deployment for edge
# Single-replica Deployment running the inference container with
# modest CPU/memory requests and hard resource limits.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: edge-ai-inference
spec:
  replicas: 1
  selector:
    matchLabels:
      app: edge-ai
  template:
    metadata:
      labels:
        app: edge-ai
    spec:
      containers:
        - name: inference-server
          image: edge-ai:latest
          resources:
            limits:
              memory: "512Mi"
              cpu: "500m"
            requests:
              memory: "256Mi"
              cpu: "250m"
          ports:
            - containerPort: 8080
| Tool | Framework | Quantization Types | Ease of Use |
|------|-----------|--------------------|-------------|
| TensorFlow Model Optimization | TensorFlow | INT8, FP16 | High |
| Intel Neural Compressor | Multi-framework | INT8, INT4 | Medium |
| NVIDIA TensorRT | ONNX, TensorFlow | INT8, FP16 | Medium |
| PyTorch Quantization | PyTorch | Dynamic, Static | High |
# Intel Neural Compressor example
from neural_compressor import Quantization
def quantize_model_inc(model_path, dataset):
    """Quantize a model with Intel Neural Compressor.

    Applies post-training static quantization, accepting at most a 1%
    relative accuracy drop and aborting the tuning loop after one hour.
    """
    quant_config = {
        'model': model_path,
        'approach': 'post_training_static_quant',
        'accuracy_criterion': {'relative': 0.01},  # 1% accuracy loss tolerance
        'exit_policy': {'timeout': 3600}  # 1 hour timeout
    }
    engine = Quantization(quant_config)
    engine.calib_dataloader = dataset
    return engine.fit()
Monitoring and Debugging
Edge AI Monitoring
# Real-time monitoring system
import logging
import json
from datetime import datetime
class EdgeAIMonitor:
    """Track inference counts, error rate and average latency for a model.

    Metrics live in an in-memory dict; every inference is additionally
    emitted as a JSON line through the logging module.
    """

    def __init__(self, model_name):
        self.model_name = model_name
        self.metrics = {
            'inference_count': 0,   # total calls, successful or not
            'error_count': 0,       # failed calls
            'avg_latency': 0,       # running mean over *successful* calls, in ms
            'last_update': datetime.now()
        }

    def log_inference(self, latency, success=True):
        """Record one inference and emit a JSON log line.

        Args:
            latency: Inference latency in milliseconds.
            success: Whether the inference succeeded.

        Bug fix: the running average previously divided by the total call
        count (failures included), so each failure silently dragged the
        reported average latency toward zero. The mean is now computed
        over successful inferences only.
        """
        self.metrics['inference_count'] += 1
        if success:
            # Number of successful calls so far, including this one.
            successes = self.metrics['inference_count'] - self.metrics['error_count']
            previous_avg = self.metrics['avg_latency']
            self.metrics['avg_latency'] = (
                (previous_avg * (successes - 1) + latency) / successes
            )
        else:
            self.metrics['error_count'] += 1
        self.metrics['last_update'] = datetime.now()
        # Structured log entry (one JSON object per line).
        log_entry = {
            'timestamp': datetime.now().isoformat(),
            'model': self.model_name,
            'latency_ms': latency,
            'success': success,
            'total_inferences': self.metrics['inference_count']
        }
        logging.info(json.dumps(log_entry))

    def get_health_status(self):
        """Classify current health from error rate and average latency.

        Returns:
            Dict with 'status' (UNHEALTHY if error rate > 10%, DEGRADED if
            average latency > 100 ms, else HEALTHY), the raw metrics dict,
            and the computed error rate.
        """
        # max(..., 1) guards against division by zero before any calls.
        error_rate = self.metrics['error_count'] / max(self.metrics['inference_count'], 1)
        if error_rate > 0.1:
            status = 'UNHEALTHY'
        elif self.metrics['avg_latency'] > 100:  # 100 ms latency threshold
            status = 'DEGRADED'
        else:
            status = 'HEALTHY'
        return {
            'status': status,
            'metrics': self.metrics,
            'error_rate': error_rate
        }
CI/CD for Edge AI
# GitHub Actions workflow for EdgeAI
# On every push / pull request against main: run the test suite,
# validate the model, convert it to TensorFlow Lite, then benchmark it.
name: EdgeAI CI/CD
on:
  push:
    branches: [main]
  pull_request:
    branches: [main]
jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.9'
      - name: Install dependencies
        run: |
          pip install -r requirements.txt
          pip install pytest
      - name: Run tests
        run: pytest tests/
      - name: Model validation
        run: python validate_model.py
      - name: Convert to TensorFlow Lite
        run: python convert_to_tflite.py
      - name: Benchmark on edge devices
        run: python benchmark_edge.py
Model Versioning
# MLflow integration for edge models
import mlflow
import mlflow.tensorflow
class EdgeModelRegistry:
    """Register edge models in MLflow and push them to devices.

    get_model_size() and push_to_device() are extension points: the
    original code invoked them without defining them, so both code paths
    always failed with AttributeError. They now raise NotImplementedError
    with explicit messages until a concrete backend supplies them.
    """

    def __init__(self, tracking_uri):
        mlflow.set_tracking_uri(tracking_uri)

    def get_model_size(self, model):
        """Return the model's size in MB. Override per model format."""
        raise NotImplementedError(
            "EdgeModelRegistry.get_model_size must be implemented for the "
            "model format in use"
        )

    def push_to_device(self, model_uri, device):
        """Download model_uri and install it on device. Override per fleet."""
        raise NotImplementedError(
            "EdgeModelRegistry.push_to_device must be implemented for the "
            "target device fleet"
        )

    def log_edge_model(self, model, metrics, model_name):
        """Log a model to MLflow together with edge-specific metrics.

        Args:
            model: The TensorFlow model object to register.
            metrics: Dict of metric name -> numeric value; 'latency' and
                'memory' keys are also mirrored into edge-specific metrics.
            model_name: Name under which to register the model.
        """
        with mlflow.start_run():
            # Log caller-supplied metrics as-is.
            for key, value in metrics.items():
                mlflow.log_metric(key, value)
            # Edge-specific metrics (size, latency, memory footprint).
            mlflow.log_metric("model_size_mb", self.get_model_size(model))
            mlflow.log_metric("inference_latency_ms", metrics.get('latency', 0))
            mlflow.log_metric("memory_usage_mb", metrics.get('memory', 0))
            # Log and register the model artifact itself.
            mlflow.tensorflow.log_model(
                model,
                "model",
                registered_model_name=model_name
            )

    def deploy_to_edge(self, model_name, version, edge_devices):
        """Deploy a registered model version to each device in edge_devices."""
        model_uri = f"models:/{model_name}/{version}"
        for device in edge_devices:
            # Download and deploy model via the fleet-specific hook.
            self.push_to_device(model_uri, device)
Development Stacks
| Use Case | Framework | Optimization | Deployment | Monitoring |
|----------|-----------|--------------|------------|------------|
| Mobile Apps | TensorFlow Lite | Model Optimization Toolkit | Android/iOS | Firebase |
| IoT Devices | TensorFlow Micro | Quantization | FreeRTOS | Custom logging |
| Edge Servers | ONNX Runtime | TensorRT | Docker/K8s | Prometheus |
| Automotive | PyTorch Mobile | Quantization | QNX/Linux | CAN bus |
Next: Benchmarks - Performance metrics and comparison studies.