# EdgeAI Datasets

Datasets specifically designed for edge AI development, benchmarking, and evaluation across various domains and hardware platforms.
## Computer Vision Datasets

### Image Classification
| Dataset | Images | Classes | Size | Use Case |
|---|---|---|---|---|
| ImageNet | 14M | 1000 | 150GB | General classification |
| CIFAR-10 | 60K | 10 | 163MB | Lightweight models |
| CIFAR-100 | 60K | 100 | 163MB | Fine-grained classification |
| MobileNet ImageNet | 1.2M | 1000 | 50GB | Mobile optimization |
# Loading edge-optimized datasets
import tensorflow_datasets as tfds
def load_edge_dataset(dataset_name, split='train', batch_size=32):
    """Load a TFDS dataset with preprocessing suited to edge training.

    Args:
        dataset_name: Name of a dataset registered with TensorFlow Datasets.
        split: Dataset split to load (e.g. ``'train'``, ``'test'``).
        batch_size: Number of samples per batch.

    Returns:
        A ``tf.data.Dataset`` yielding ``(image, label)`` batches where images
        are float32 in [0, 1], resized to 224x224.
    """
    # Fix: the original referenced `tf` without importing TensorFlow anywhere;
    # import locally so the function is self-contained.
    import tensorflow as tf

    dataset = tfds.load(
        dataset_name,
        split=split,
        as_supervised=True,
        batch_size=batch_size,
    )

    def preprocess(image, label):
        # Scale pixels to [0, 1] and resize to the standard mobile input
        # resolution (224x224, as used by MobileNet-family models).
        image = tf.cast(image, tf.float32) / 255.0
        image = tf.image.resize(image, [224, 224])
        return image, label

    return dataset.map(preprocess)
# Edge dataset characteristics: approximate footprint and training-time
# figures used to judge whether a dataset is practical for on-device work.
edge_datasets = dict(
    cifar10=dict(memory_footprint='163MB', training_time='2 hours', edge_friendly=True),
    imagenet_mobile=dict(memory_footprint='50GB', training_time='24 hours', edge_friendly=False),
    flowers102=dict(memory_footprint='330MB', training_time='1 hour', edge_friendly=True),
)
### Object Detection

| Dataset | Images | Objects | Annotations | Domain |
|---|---|---|---|---|
| COCO | 330K | 80 classes | 2.5M | General objects |
| Open Images | 9M | 600 classes | 36M | Web images |
| Pascal VOC | 20K | 20 classes | 50K | Standard benchmark |
| Edge Detection Dataset | 100K | 10 classes | 500K | Edge-optimized |
## Sensor and IoT Datasets

### Time Series Data
# IoT sensor dataset example
class EdgeIoTDataset:
    """Catalog of example IoT sensor datasets for edge AI prototyping.

    Holds static metadata (sample counts, feature counts, sampling rates,
    typical applications) for a few common sensor modalities.
    """

    def __init__(self, sensor_type='accelerometer'):
        # Fix: the original accepted `sensor_type` but never stored or used it.
        self.sensor_type = sensor_type
        self.datasets = {
            'accelerometer': {
                'samples': 50000,
                'features': 3,  # x, y, z axes
                'sampling_rate': '100Hz',
                'applications': ['Activity recognition', 'Fall detection']
            },
            'temperature': {
                'samples': 100000,
                'features': 1,
                'sampling_rate': '1Hz',
                'applications': ['HVAC control', 'Predictive maintenance']
            },
            'vibration': {
                'samples': 200000,
                'features': 3,
                'sampling_rate': '1kHz',
                'applications': ['Machine health', 'Fault detection']
            }
        }

    def get_dataset_info(self, sensor_type=None):
        """Return metadata for ``sensor_type``.

        Defaults to the sensor configured at construction time when no
        argument is given; returns an empty dict for unknown sensors.
        """
        if sensor_type is None:
            sensor_type = self.sensor_type
        return self.datasets.get(sensor_type, {})
# Widely used public IoT / human-activity-recognition datasets.
iot_datasets = dict(
    UCI_HAR=dict(size='25MB', activities=6, subjects=30, sensors='Accelerometer, Gyroscope'),
    OPPORTUNITY=dict(size='2.8GB', activities=18, subjects=4, sensors='Multi-modal'),
    PAMAP2=dict(size='1.2GB', activities=18, subjects=9, sensors='IMU, Heart rate'),
)
## Edge-Specific Benchmarks

### MLPerf Mobile

| Model | Task | Dataset | Metric | Target Hardware |
|---|---|---|---|---|
| MobileNet v1 | Classification | ImageNet | Top-1 Accuracy | Mobile devices |
| SSD MobileNet | Detection | COCO | mAP | Edge devices |
| DeepLab v3 | Segmentation | ADE20K | mIoU | Embedded systems |
| RNN-T | Speech | LibriSpeech | WER | Voice assistants |
# MLPerf mobile benchmark
def run_mlperf_benchmark(model_path, dataset_path, device='cpu'):
    """Skeleton runner for an MLPerf-mobile style benchmark.

    Args:
        model_path: Path to the model file to benchmark.
        dataset_path: Path to the evaluation dataset.
        device: Target device identifier (e.g. ``'cpu'``).

    Returns:
        A dict of benchmark metrics. NOTE: this is a placeholder — the metric
        values are zeros until real measurement logic is plugged in; the
        configuration used is echoed under the ``'config'`` key.
    """
    benchmark_config = {
        'model_path': model_path,
        'dataset_path': dataset_path,
        # Fix: the original accepted `device` but never used it.
        'device': device,
        'batch_size': 1,  # single-sample inference matches mobile usage
        'num_threads': 4,
        'warmup_runs': 10,
        'benchmark_runs': 100,
    }
    # TODO: run `warmup_runs` + `benchmark_runs` timed inferences using
    # benchmark_config and populate these metrics; zeros are placeholders.
    results = {
        'avg_latency_ms': 0,
        'throughput_fps': 0,
        'accuracy': 0,
        'memory_usage_mb': 0,
        'power_consumption_w': 0,
        'config': benchmark_config,
    }
    return results
# Example MLPerf-style measurements, keyed by device then by model.
mlperf_results = dict(
    pixel_6=dict(
        mobilenet_v1={'latency': '12ms', 'accuracy': '71.8%', 'power': '2.1W'},
        ssd_mobilenet={'latency': '45ms', 'mAP': '23.2%', 'power': '3.2W'},
    ),
    jetson_nano=dict(
        mobilenet_v1={'latency': '23ms', 'accuracy': '71.8%', 'power': '5.2W'},
        ssd_mobilenet={'latency': '89ms', 'mAP': '23.2%', 'power': '6.8W'},
    ),
)
## Domain-Specific Datasets

### Healthcare

| Dataset | Type | Size | Samples | Application |
|---|---|---|---|---|
| ChestX-ray14 | Medical imaging | 45GB | 112K | Pneumonia detection |
| MIMIC-III | Clinical data | 6GB | 58K patients | Patient monitoring |
| PhysioNet | Physiological signals | 2TB | Various | Wearable devices |
| Skin Cancer MNIST | Dermatology | 600MB | 10K | Mobile diagnosis |
### Manufacturing
# Example manufacturing datasets for predictive-maintenance and
# visual-inspection models.
manufacturing_datasets = {
    'bearing_fault': dict(
        description='Bearing vibration data for fault detection',
        samples=100000,
        features=['vibration_x', 'vibration_y', 'vibration_z', 'temperature'],
        labels=['normal', 'inner_race_fault', 'outer_race_fault', 'ball_fault'],
        sampling_rate='12kHz',
        file_size='2.3GB',
    ),
    'steel_defect': dict(
        description='Steel surface defect images',
        samples=12600,
        image_size='200x200',
        defect_types=6,
        file_size='1.1GB',
    ),
}
## Synthetic Datasets

### Data Generation for Edge
import numpy as np
from sklearn.datasets import make_classification
def generate_edge_dataset(n_samples=10000, n_features=10, n_classes=2,
                          noise_level=0.1, random_state=42):
    """Generate a reproducible synthetic dataset for edge AI development.

    Args:
        n_samples: Number of samples to generate.
        n_features: Number of (all informative) features per sample.
        n_classes: Number of target classes.
        noise_level: Standard deviation of additive Gaussian noise that
            simulates real-world sensor conditions.
        random_state: Seed for both the classification generator and the
            noise, so repeated calls produce identical data.

    Returns:
        Tuple ``(X, y)`` of feature matrix and label vector.
    """
    X, y = make_classification(
        n_samples=n_samples,
        n_features=n_features,
        n_classes=n_classes,
        n_redundant=0,
        n_informative=n_features,
        random_state=random_state,
    )
    # Fix: the original drew noise from the global unseeded RNG, making the
    # output irreproducible despite seeding make_classification. Use a
    # dedicated seeded generator instead.
    rng = np.random.default_rng(random_state)
    X += rng.normal(0.0, noise_level, X.shape)
    return X, y
# Why synthetic data is attractive for edge AI development
synthetic_benefits = dict(
    privacy_preservation='No real user data exposed',
    data_augmentation='Unlimited training samples',
    edge_case_simulation='Test rare scenarios',
    cost_effectiveness='No data collection costs',
    rapid_prototyping='Quick model development',
)
## Dataset Optimization for Edge

### Data Preprocessing
class EdgeDataPreprocessor:
    """Prepares image datasets for memory-constrained edge deployment."""

    # Bytes per element for each supported storage precision.
    _PRECISION_BYTES = {'float32': 4, 'float16': 2, 'int8': 1}

    def __init__(self, target_size=(224, 224)):
        # Target (height, width) images are resized to for edge inference.
        self.target_size = target_size

    def optimize_for_edge(self, dataset):
        """Apply edge-oriented optimizations and return the processed dataset."""
        optimizations = {
            'resize_images': self.target_size,
            'normalize_values': True,
            'reduce_precision': 'float16',
            'compress_format': 'jpeg_90',
            'batch_processing': True,
        }
        return self.apply_optimizations(dataset, optimizations)

    def apply_optimizations(self, dataset, optimizations):
        """Apply the given optimization plan to ``dataset``.

        Fix: the original called this method without defining it, so
        ``optimize_for_edge`` always raised AttributeError. This passthrough
        keeps the class usable; real pipelines should override it with
        framework-specific transforms (resize, cast, compress, batch).
        """
        return dataset

    def calculate_memory_footprint(self, dataset_size, image_dims, precision='float32'):
        """Return the estimated in-memory size of the dataset in MB.

        Args:
            dataset_size: Number of samples.
            image_dims: Per-sample dimensions, e.g. ``(224, 224, 3)``.
            precision: One of ``'float32'``, ``'float16'``, ``'int8'``.

        Raises:
            KeyError: If ``precision`` is not a supported precision name.
        """
        bytes_per_sample = np.prod(image_dims) * self._PRECISION_BYTES[precision]
        return (dataset_size * bytes_per_sample) / (1024 * 1024)
# Measured impact of precision/format optimizations on an ImageNet-scale set.
optimization_results = dict(
    original_float32={'size': '150GB', 'memory': '2.4GB', 'loading_time': '45s'},
    optimized_float16={'size': '75GB', 'memory': '1.2GB', 'loading_time': '23s'},
    quantized_int8={'size': '37.5GB', 'memory': '600MB', 'loading_time': '12s'},
)
## Federated Learning Datasets

### Distributed Data Scenarios

| Dataset | Participants | Data Distribution | Privacy Level | Use Case |
|---|---|---|---|---|
| FEMNIST | 3,550 | Non-IID | High | Handwriting recognition |
| CIFAR-10 FL | 100 | IID/Non-IID | Medium | Image classification |
| Shakespeare | 715 | Natural | High | Language modeling |
| Synthetic FL | Configurable | Customizable | Variable | Research |
# Federated dataset simulation
class FederatedDatasetSimulator:
    """Simulates partitioning a base dataset across federated clients.

    NOTE(review): this snippet relies on ``sample_client_data`` and
    ``get_class_distribution``, which are not defined here — confirm they are
    provided elsewhere (e.g. a subclass or a fuller version of this file).
    """

    def __init__(self, base_dataset, num_clients=10):
        self.base_dataset = base_dataset
        self.num_clients = num_clients

    def create_non_iid_split(self, alpha=0.5):
        """Create a non-IID data distribution across clients.

        ``alpha`` is the Dirichlet concentration parameter: smaller values
        yield more skewed per-client class distributions, simulating
        realistic federated scenarios.
        """
        client_data = {}
        for client_id in range(self.num_clients):
            # Assign data based on the Dirichlet distribution.
            client_samples = self.sample_client_data(alpha)
            client_data[f'client_{client_id}'] = {
                'data_size': len(client_samples),
                'class_distribution': self.get_class_distribution(client_samples),
                # Random per-client factor simulating varying data quality.
                'data_quality': np.random.uniform(0.8, 1.0),
            }
        return client_data
# Core challenges that federated learning deployments must contend with.
federated_challenges = dict(
    data_heterogeneity='Non-IID data distribution across clients',
    system_heterogeneity='Varying computational capabilities',
    statistical_heterogeneity='Different local data distributions',
    communication_constraints='Limited bandwidth and intermittent connectivity',
)
## Evaluation Metrics

### Edge-Specific Metrics
def evaluate_edge_model(model, test_dataset, hardware_constraints):
    """Comprehensive evaluation of a model for edge deployment.

    Collects accuracy plus resource metrics (latency, memory, energy, size,
    throughput) and derives a single deployment score against the hardware
    budget.

    NOTE(review): depends on helpers (``calculate_accuracy``,
    ``measure_inference_time``, ``measure_memory_footprint``,
    ``measure_energy_usage``, ``get_model_size``, ``calculate_throughput``,
    ``calculate_edge_deployment_score``) that are defined elsewhere in the
    project — confirm they are importable where this runs.
    """
    metrics = {
        'accuracy': calculate_accuracy(model, test_dataset),
        'latency_ms': measure_inference_time(model),
        'memory_usage_mb': measure_memory_footprint(model),
        'energy_consumption_mj': measure_energy_usage(model),
        'model_size_mb': get_model_size(model),
        'throughput_fps': calculate_throughput(model),
    }
    # Single score combining the raw metrics with the target hardware budget.
    metrics['edge_deployment_score'] = calculate_edge_deployment_score(
        metrics, hardware_constraints
    )
    return metrics
# Accuracy / latency / size trade-off across deployment tiers.
benchmark_comparison = dict(
    cloud_model={'accuracy': '94.2%', 'latency': '150ms', 'size': '250MB'},
    edge_optimized={'accuracy': '91.8%', 'latency': '25ms', 'size': '15MB'},
    ultra_lightweight={'accuracy': '87.3%', 'latency': '8ms', 'size': '2MB'},
)
**Next:** Tools — development and deployment tools for EdgeAI.