# EdgeAI Datasets

Datasets specifically designed for edge AI development, benchmarking, and evaluation across various domains and hardware platforms.

Computer Vision Datasets

Image Classification

Dataset Images Classes Size Use Case
ImageNet 14M 1000 150GB General classification
CIFAR-10 60K 10 163MB Lightweight models
CIFAR-100 60K 100 163MB Fine-grained classification
MobileNet ImageNet 1.2M 1000 50GB Mobile optimization
# Loading edge-optimized datasets
import tensorflow_datasets as tfds

def load_edge_dataset(dataset_name, split='train', batch_size=32):
    """Load a TFDS dataset preprocessed for edge training.

    Args:
        dataset_name: TFDS registry name (e.g. 'cifar10').
        split: Dataset split to load (e.g. 'train', 'test').
        batch_size: Batch size passed to tfds.load.

    Returns:
        A tf.data.Dataset of (image, label) pairs with images cast to
        float32, scaled to [0, 1], and resized to 224x224.
    """
    # Fix: the original snippet used `tf` without ever importing
    # TensorFlow (only tensorflow_datasets was imported), which raised
    # NameError inside preprocess().
    import tensorflow as tf

    dataset = tfds.load(
        dataset_name,
        split=split,
        as_supervised=True,
        batch_size=batch_size
    )

    def preprocess(image, label):
        # Scale pixels to [0, 1] and resize to the standard input
        # resolution used by mobile architectures.
        image = tf.cast(image, tf.float32) / 255.0
        image = tf.image.resize(image, [224, 224])
        return image, label

    return dataset.map(preprocess)

# Resource profiles used to judge whether a dataset is practical for
# on-device training.
def _edge_profile(footprint, train_time, friendly):
    # Small factory keeps the per-dataset entries uniform.
    return {
        'memory_footprint': footprint,
        'training_time': train_time,
        'edge_friendly': friendly,
    }

edge_datasets = {
    'cifar10': _edge_profile('163MB', '2 hours', True),
    'imagenet_mobile': _edge_profile('50GB', '24 hours', False),
    'flowers102': _edge_profile('330MB', '1 hour', True),
}

Object Detection

Dataset Images Objects Annotations Domain
COCO 330K 80 classes 2.5M General objects
Open Images 9M 600 classes 36M Web images
Pascal VOC 20K 20 classes 50K Standard benchmark
Edge Detection Dataset 100K 10 classes 500K Edge-optimized

Sensor and IoT Datasets

Time Series Data

# IoT sensor dataset example
class EdgeIoTDataset:
    """Catalog of example IoT sensor datasets for edge AI prototyping."""

    def __init__(self, sensor_type='accelerometer'):
        # Fix: the constructor previously accepted `sensor_type` but
        # ignored it; remember it so get_dataset_info() can default to
        # the configured sensor.
        self.sensor_type = sensor_type
        self.datasets = {
            'accelerometer': {
                'samples': 50000,
                'features': 3,  # x, y, z axes
                'sampling_rate': '100Hz',
                'applications': ['Activity recognition', 'Fall detection']
            },
            'temperature': {
                'samples': 100000,
                'features': 1,
                'sampling_rate': '1Hz',
                'applications': ['HVAC control', 'Predictive maintenance']
            },
            'vibration': {
                'samples': 200000,
                'features': 3,
                'sampling_rate': '1kHz',
                'applications': ['Machine health', 'Fault detection']
            }
        }

    def get_dataset_info(self, sensor_type=None):
        """Return metadata for a sensor type.

        Args:
            sensor_type: Sensor name to look up; defaults to the type
                passed at construction time (backward-compatible — all
                previous callers passed it explicitly).

        Returns:
            The metadata dict, or {} for unknown sensor types.
        """
        if sensor_type is None:
            sensor_type = self.sensor_type
        return self.datasets.get(sensor_type, {})

# Popular public human-activity / wearable-sensor datasets.
_IOT_FIELDS = ('size', 'activities', 'subjects', 'sensors')
iot_datasets = {
    name: dict(zip(_IOT_FIELDS, values))
    for name, values in [
        ('UCI_HAR', ('25MB', 6, 30, 'Accelerometer, Gyroscope')),
        ('OPPORTUNITY', ('2.8GB', 18, 4, 'Multi-modal')),
        ('PAMAP2', ('1.2GB', 18, 9, 'IMU, Heart rate')),
    ]
}

Edge-Specific Benchmarks

MLPerf Mobile

Model Task Dataset Metric Target Hardware
MobileNet v1 Classification ImageNet Top-1 Accuracy Mobile devices
SSD MobileNet Detection COCO mAP Edge devices
DeepLab v3 Segmentation ADE20K mIoU Embedded systems
RNN-T Speech LibriSpeech WER Voice assistants
# MLPerf mobile benchmark
def run_mlperf_benchmark(model_path, dataset_path, device='cpu'):
    """Run the MLPerf mobile benchmark (placeholder implementation).

    Args:
        model_path: Path to the model under test.
        dataset_path: Path to the evaluation dataset.
        device: Target device identifier (currently informational only).

    Returns:
        A dict of benchmark metrics, all zeroed until a real harness
        fills them in.
    """
    # Batch size 1 mirrors single-inference mobile serving; warmup runs
    # let the runtime reach a steady state before timed runs.
    benchmark_config = dict(
        model_path=model_path,
        dataset_path=dataset_path,
        batch_size=1,
        num_threads=4,
        warmup_runs=10,
        benchmark_runs=100,
    )

    # NOTE(review): benchmark_config is built but not yet consumed; a
    # real harness would drive the measurement loop from it.
    metric_names = (
        'avg_latency_ms',
        'throughput_fps',
        'accuracy',
        'memory_usage_mb',
        'power_consumption_w',
    )
    return {name: 0 for name in metric_names}

# MLPerf mobile results per device (illustrative example numbers).
_pixel6_runs = {
    'mobilenet_v1': {'latency': '12ms', 'accuracy': '71.8%', 'power': '2.1W'},
    'ssd_mobilenet': {'latency': '45ms', 'mAP': '23.2%', 'power': '3.2W'},
}
_jetson_nano_runs = {
    'mobilenet_v1': {'latency': '23ms', 'accuracy': '71.8%', 'power': '5.2W'},
    'ssd_mobilenet': {'latency': '89ms', 'mAP': '23.2%', 'power': '6.8W'},
}
mlperf_results = {
    'pixel_6': _pixel6_runs,
    'jetson_nano': _jetson_nano_runs,
}

Domain-Specific Datasets

Healthcare

Dataset Type Size Samples Application
ChestX-ray14 Medical imaging 45GB 112K Pneumonia detection
MIMIC-III Clinical data 6GB 58K patients Patient monitoring
PhysioNet Physiological signals 2TB Various Wearable devices
Skin Cancer MNIST Dermatology 600MB 10K Mobile diagnosis

Manufacturing

# Manufacturing-domain example datasets: vibration-based predictive
# maintenance and visual surface-defect inspection.
manufacturing_datasets = dict(
    bearing_fault=dict(
        description='Bearing vibration data for fault detection',
        samples=100000,
        features=['vibration_x', 'vibration_y', 'vibration_z', 'temperature'],
        labels=['normal', 'inner_race_fault', 'outer_race_fault', 'ball_fault'],
        sampling_rate='12kHz',
        file_size='2.3GB',
    ),
    steel_defect=dict(
        description='Steel surface defect images',
        samples=12600,
        image_size='200x200',
        defect_types=6,
        file_size='1.1GB',
    ),
)

Synthetic Datasets

Data Generation for Edge

import numpy as np
from sklearn.datasets import make_classification

def generate_edge_dataset(n_samples=10000, n_features=10, n_classes=2,
                          noise_level=0.1, random_state=42):
    """Generate a synthetic classification dataset for edge AI development.

    Args:
        n_samples: Number of samples to generate.
        n_features: Number of features (all informative, none redundant).
        n_classes: Number of target classes.
        noise_level: Std-dev of Gaussian noise added to the features to
            simulate real-world sensor conditions (was hard-coded to 0.1).
        random_state: Seed for both the classification generator and the
            noise, so runs are fully reproducible.

    Returns:
        Tuple (X, y): feature matrix and label vector.
    """
    X, y = make_classification(
        n_samples=n_samples,
        n_features=n_features,
        n_classes=n_classes,
        n_redundant=0,
        n_informative=n_features,
        random_state=random_state,
    )

    # Fix: the noise previously came from the unseeded global RNG, so
    # outputs differed run-to-run despite random_state=42 above. A
    # seeded Generator makes the whole dataset deterministic.
    rng = np.random.default_rng(random_state)
    X += rng.normal(0.0, noise_level, X.shape)

    return X, y

# Why synthetic data suits edge AI workflows.
synthetic_benefits = dict(
    privacy_preservation='No real user data exposed',
    data_augmentation='Unlimited training samples',
    edge_case_simulation='Test rare scenarios',
    cost_effectiveness='No data collection costs',
    rapid_prototyping='Quick model development',
)

Dataset Optimization for Edge

Data Preprocessing

class EdgeDataPreprocessor:
    """Prepares datasets for memory- and compute-constrained edge targets."""

    def __init__(self, target_size=(224, 224)):
        # (height, width) that images are resized to during optimization.
        self.target_size = target_size

    def optimize_for_edge(self, dataset):
        """Optimize a dataset for edge deployment.

        Builds the optimization recipe (resize, normalize, reduce
        precision, compress, batch) and applies it via
        apply_optimizations().
        """
        optimizations = {
            'resize_images': self.target_size,
            'normalize_values': True,
            'reduce_precision': 'float16',
            'compress_format': 'jpeg_90',
            'batch_processing': True
        }

        # Apply optimizations
        processed_dataset = self.apply_optimizations(dataset, optimizations)

        return processed_dataset

    def apply_optimizations(self, dataset, optimizations):
        """Apply an optimization recipe to a dataset.

        Fix: optimize_for_edge() called this method, but it was never
        defined, so every call raised AttributeError. This placeholder
        returns the dataset unchanged; a real pipeline would act on each
        key of `optimizations`.
        """
        return dataset

    def calculate_memory_footprint(self, dataset_size, image_dims, precision='float32'):
        """Return the in-memory size of the dataset in MiB.

        Args:
            dataset_size: Number of samples.
            image_dims: Per-sample dimensions, e.g. (224, 224, 3).
            precision: One of 'float32', 'float16', 'int8'.

        Raises:
            KeyError: If `precision` is not a supported dtype name.
        """
        precision_bytes = {'float32': 4, 'float16': 2, 'int8': 1}
        bytes_per_sample = np.prod(image_dims) * precision_bytes[precision]
        total_memory_mb = (dataset_size * bytes_per_sample) / (1024 * 1024)

        return total_memory_mb

# Effect of precision reduction on on-disk size, resident memory, and
# load time (example figures).
_OPT_FIELDS = ('size', 'memory', 'loading_time')
optimization_results = {
    'original_float32': dict(zip(_OPT_FIELDS, ('150GB', '2.4GB', '45s'))),
    'optimized_float16': dict(zip(_OPT_FIELDS, ('75GB', '1.2GB', '23s'))),
    'quantized_int8': dict(zip(_OPT_FIELDS, ('37.5GB', '600MB', '12s'))),
}

Federated Learning Datasets

Distributed Data Scenarios

Dataset Participants Data Distribution Privacy Level Use Case
FEMNIST 3,550 Non-IID High Handwriting recognition
CIFAR-10 FL 100 IID/Non-IID Medium Image classification
Shakespeare 715 Natural High Language modeling
Synthetic FL Configurable Customizable Variable Research
# Federated dataset simulation
class FederatedDatasetSimulator:
    """Simulate partitioning a dataset across federated clients.

    NOTE(review): `sample_client_data` and `get_class_distribution`,
    called below, are not defined in this snippet — presumably they are
    provided elsewhere or left as an exercise; confirm before use.
    """

    def __init__(self, base_dataset, num_clients=10):
        # Dataset to partition and the number of simulated participants.
        self.base_dataset = base_dataset
        self.num_clients = num_clients

    def create_non_iid_split(self, alpha=0.5):
        """Create non-IID data distribution using Dirichlet distribution"""

        # Dirichlet concentration `alpha`: lower values presumably yield
        # more skewed per-client class mixes — the usual convention, but
        # the sampling itself happens in sample_client_data (not shown).
        # Simulate realistic federated scenarios
        client_data = {}

        for client_id in range(self.num_clients):
            # Assign data based on Dirichlet distribution
            client_samples = self.sample_client_data(alpha)
            # Per-client summary: sample count, label mix, and a random
            # quality factor drawn from the unseeded global RNG.
            client_data[f'client_{client_id}'] = {
                'data_size': len(client_samples),
                'class_distribution': self.get_class_distribution(client_samples),
                'data_quality': np.random.uniform(0.8, 1.0)  # Simulate varying quality
            }

        return client_data

# Principal obstacles in federated learning deployments.
federated_challenges = dict(
    data_heterogeneity='Non-IID data distribution across clients',
    system_heterogeneity='Varying computational capabilities',
    statistical_heterogeneity='Different local data distributions',
    communication_constraints='Limited bandwidth and intermittent connectivity',
)

Evaluation Metrics

Edge-Specific Metrics

def evaluate_edge_model(model, test_dataset, hardware_constraints):
    """Comprehensive evaluation of a model for edge deployment.

    Gathers accuracy, latency, memory, energy, model-size, and
    throughput metrics, then appends an overall deployment score
    computed against the supplied hardware constraints.
    """
    # Each metric is produced by a dedicated measurement helper.
    collectors = (
        ('accuracy', lambda: calculate_accuracy(model, test_dataset)),
        ('latency_ms', lambda: measure_inference_time(model)),
        ('memory_usage_mb', lambda: measure_memory_footprint(model)),
        ('energy_consumption_mj', lambda: measure_energy_usage(model)),
        ('model_size_mb', lambda: get_model_size(model)),
        ('throughput_fps', lambda: calculate_throughput(model)),
    )
    metrics = {name: collect() for name, collect in collectors}

    # Single composite number summarizing fitness for the target hardware.
    metrics['edge_deployment_score'] = calculate_edge_deployment_score(
        metrics, hardware_constraints
    )

    return metrics

# Accuracy/latency/size trade-off across deployment tiers.
_TIER_FIELDS = ('accuracy', 'latency', 'size')
benchmark_comparison = {
    tier: dict(zip(_TIER_FIELDS, values))
    for tier, values in [
        ('cloud_model', ('94.2%', '150ms', '250MB')),
        ('edge_optimized', ('91.8%', '25ms', '15MB')),
        ('ultra_lightweight', ('87.3%', '8ms', '2MB')),
    ]
}

Next: Tools - Development and deployment tools for EdgeAI.