Testing Guide

Comprehensive guide to testing in LMM-Vibes.

Running Tests

Basic Test Execution

# Run all tests
pytest

# Run with verbose output
pytest -v

# Run specific test file
pytest tests/test_evaluation.py

# Run specific test function
pytest tests/test_evaluation.py::test_evaluate_model_basic

Test Coverage

# Run with coverage report
pytest --cov=lmmvibes

# Generate HTML coverage report
pytest --cov=lmmvibes --cov-report=html

# Generate XML coverage report (for CI)
pytest --cov=lmmvibes --cov-report=xml

Test Categories

# Run unit tests only
pytest -m "not integration"

# Run integration tests only
pytest -m integration

# Run slow tests
pytest -m slow

# Skip slow tests
pytest -m "not slow"

Writing Tests

Test Structure

import pytest
from lmmvibes.evaluation import evaluate_model

class TestEvaluation:
    """Test suite for evaluation functionality."""

    def test_basic_evaluation(self):
        """Test basic model evaluation."""
        # Arrange
        data = [{"question": "What is 2+2?", "answer": "4", "model_output": "4"}]

        # Act
        results = evaluate_model(data, metrics=["accuracy"])

        # Assert
        assert "accuracy" in results
        assert results["accuracy"] == 1.0

    def test_empty_data(self):
        """Test evaluation with empty data."""
        with pytest.raises(ValueError, match="Data cannot be empty"):
            evaluate_model([])

    @pytest.mark.parametrize("metric", ["accuracy", "bleu", "rouge"])
    def test_metric_computation(self, metric):
        """Test computation of different metrics."""
        data = [{"question": "Test", "answer": "answer", "model_output": "answer"}]
        results = evaluate_model(data, metrics=[metric])
        assert metric in results

Test Fixtures

import pytest

@pytest.fixture
def sample_data():
    """Provide sample data for tests."""
    return [
        {"question": "What is 2+2?", "answer": "4", "model_output": "4"},
        {"question": "What is 3+3?", "answer": "6", "model_output": "6"}
    ]

@pytest.fixture
def evaluation_config():
    """Provide evaluation configuration."""
    return {
        "metrics": ["accuracy", "bleu"],
        "batch_size": 32,
        "save_results": False
    }

def test_evaluation_with_fixtures(sample_data, evaluation_config):
    """Test evaluation using fixtures."""
    results = evaluate_model(sample_data, config=evaluation_config)
    assert "accuracy" in results
    assert "bleu" in results

Mocking

from unittest.mock import patch

def test_external_api_call():
    """Test a function that calls an external API."""
    with patch('lmmvibes.external_api.call_api') as mock_api:
        mock_api.return_value = {"result": "success"}

        # call_external_function is assumed to wrap call_api and
        # return the "result" field of its response
        result = call_external_function()

        assert result == "success"
        mock_api.assert_called_once()

Test Categories

Unit Tests

Test individual functions and classes in isolation.

def test_metric_computation():
    """Test metric computation logic."""
    from lmmvibes.metrics import AccuracyMetric

    metric = AccuracyMetric()
    predictions = ["4", "6", "8"]
    references = ["4", "6", "8"]

    score = metric.compute(predictions, references)
    assert score == 1.0

Integration Tests

Test interactions between components.

@pytest.mark.integration
def test_full_evaluation_pipeline():
    """Test complete evaluation pipeline."""
    # Load data
    data = load_test_dataset()

    # Run evaluation
    results = evaluate_model(data, metrics=["accuracy", "bleu"])

    # Save results
    save_results(results, "test_results.json")

    # Load and verify
    loaded_results = load_results("test_results.json")
    assert loaded_results == results

Performance Tests

Test performance characteristics.

@pytest.mark.slow
def test_large_dataset_performance():
    """Test performance with large dataset."""
    import time

    # Generate large dataset
    large_data = generate_test_data(10000)

    start_time = time.time()
    results = evaluate_model(large_data, metrics=["accuracy"])
    end_time = time.time()

    # Should complete within reasonable time
    assert end_time - start_time < 60  # 60 seconds

Test Data

Creating Test Data

import random
from typing import Dict, List

def generate_test_data(num_samples: int = 100) -> List[Dict]:
    """Generate synthetic test data."""

    questions = [
        "What is 2+2?",
        "What is the capital of France?",
        "Explain gravity",
        "What is photosynthesis?"
    ]

    data = []
    for i in range(num_samples):
        question = random.choice(questions)
        answer = f"Answer {i}"
        model_output = f"Model output {i}"

        data.append({
            "question": question,
            "answer": answer,
            "model_output": model_output,
            "metadata": {"id": i}
        })

    return data

Test Data Files

Store test data in tests/data/:

tests/
├── data/
│   ├── sample.jsonl
│   ├── large_dataset.jsonl
│   └── edge_cases.jsonl
├── test_evaluation.py
└── test_metrics.py
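
A fixture can then load these files on demand. A minimal sketch, assuming it lives in tests/conftest.py, that sample.jsonl holds one JSON object per line with the question/answer/model_output fields used above, and that the sample_records name is our own choice:

import json
from pathlib import Path

import pytest

DATA_DIR = Path(__file__).parent / "data"

@pytest.fixture
def sample_records():
    """Load tests/data/sample.jsonl, one JSON object per line."""
    with open(DATA_DIR / "sample.jsonl") as f:
        return [json.loads(line) for line in f if line.strip()]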

Assertions and Checks

Basic Assertions

def test_basic_assertions(sample_data):
    """Test basic assertion patterns."""
    results = evaluate_model(sample_data)

    # Check key exists
    assert "accuracy" in results

    # Check value range
    assert 0.0 <= results["accuracy"] <= 1.0

    # Check type
    assert isinstance(results["accuracy"], float)

    # Check approximate equality
    assert results["accuracy"] == pytest.approx(1.0, rel=0.01)

Custom Assertions

from typing import Dict

def assert_valid_results(results: Dict):
    """Custom assertion for result validation."""
    required_keys = ["accuracy", "bleu", "rouge"]

    for key in required_keys:
        assert key in results, f"Missing key: {key}"
        assert isinstance(results[key], (int, float)), f"Invalid type for {key}"
        assert 0.0 <= results[key] <= 1.0, f"Value out of range for {key}"

def test_results_validation(sample_data):
    """Test custom result validation."""
    results = evaluate_model(sample_data, metrics=["accuracy", "bleu", "rouge"])
    assert_valid_results(results)

Error Testing

Testing Exceptions

def test_invalid_input():
    """Test handling of invalid input."""
    with pytest.raises(ValueError, match="Data cannot be empty"):
        evaluate_model([])

    with pytest.raises(TypeError):
        evaluate_model("not a list")

    with pytest.raises(KeyError):
        evaluate_model([{"invalid": "data"}])

Testing Warnings

import pytest

def test_deprecation_warning():
    """Test deprecation warnings."""
    # deprecated_function stands in for any API that emits a DeprecationWarning
    with pytest.warns(DeprecationWarning, match="deprecated"):
        deprecated_function()

Test Configuration

pytest.ini

[pytest]
testpaths = tests
python_files = test_*.py
python_classes = Test*
python_functions = test_*
addopts = -v --tb=short
markers =
    slow: marks tests as slow (deselect with '-m "not slow"')
    integration: marks tests as integration tests
    unit: marks tests as unit tests

conftest.py

import pytest
import tempfile
import os

@pytest.fixture(scope="session")
def temp_dir():
    """Create temporary directory for tests."""
    with tempfile.TemporaryDirectory() as tmpdir:
        yield tmpdir

@pytest.fixture(autouse=True)
def setup_test_environment():
    """Set up test environment."""
    # Set test environment variables
    os.environ["LMMVIBES_TESTING"] = "true"
    yield
    # Cleanup
    if "LMMVIBES_TESTING" in os.environ:
        del os.environ["LMMVIBES_TESTING"]

Continuous Integration

GitHub Actions Example

name: Tests
on: [push, pull_request]
jobs:
  test:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.8", "3.9", "3.10"]

    steps:
    - uses: actions/checkout@v3
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v3
      with:
        python-version: ${{ matrix.python-version }}

    - name: Install dependencies
      run: |
        pip install -e .
        pip install -r requirements-dev.txt

    - name: Run tests
      run: |
        pytest --cov=lmmvibes --cov-report=xml

    - name: Upload coverage
      uses: codecov/codecov-action@v3
      with:
        file: ./coverage.xml

Best Practices

  1. Test Naming: Use descriptive test names
  2. Test Isolation: Each test should be independent (see the sketch after this list)
  3. Fast Tests: Keep unit tests fast (< 1 second)
  4. Coverage: Aim for high code coverage
  5. Documentation: Document complex test scenarios
  6. Maintenance: Keep tests up to date with code changes

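For test isolation (point 2), pytest's built-in tmp_path fixture gives each test its own temporary directory, so no state leaks between tests. A minimal sketch; the test name and values are only for illustration:

import json

def test_writes_results_in_isolation(tmp_path):
    """Write to a per-test temporary directory instead of a shared file."""
    out_file = tmp_path / "results.json"
    out_file.write_text(json.dumps({"accuracy": 1.0}))
    assert json.loads(out_file.read_text())["accuracy"] == 1.0
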
Next Steps