🟡 Speech Recognition Basics: Building a Simple Speech-to-Text System

Objective

Build a practical speech recognition system using modern deep learning approaches. You’ll learn to process audio data, implement speech-to-text conversion, and understand how to use both local models and cloud APIs for real-world applications.


Learning Outcomes

By completing this project, you will:

  • Master fundamental audio processing and feature extraction techniques
  • Implement modern speech recognition pipelines using deep learning
  • Learn to use industry-standard speech recognition APIs
  • Understand evaluation metrics specific to speech recognition
  • Gain practical experience with real-world audio data
  • Learn to handle challenges like noise and different accents

Skills Gained

  • Processing and analyzing audio data using modern libraries
  • Implementing speech recognition using deep learning approaches
  • Using cloud-based speech recognition APIs effectively
  • Building end-to-end audio processing pipelines
  • Evaluating speech recognition systems
  • Handling real-world audio challenges

Tools Required

# Core libraries
pip install torch torchaudio
pip install transformers
pip install librosa
pip install soundfile
pip install google-cloud-speech
pip install pyaudio
pip install jiwer  # for WER calculation

# Visualization
pip install matplotlib
pip install seaborn

Project Structure

speech_recognition/
│
├── data/
│   ├── LibriSpeech/
│   │   ├── train-clean-100/
│   │   └── test-clean/
│   └── custom_audio/
│
├── src/
│   ├── audio_processing.py
│   ├── feature_extraction.py
│   ├── model.py
│   ├── cloud_apis.py
│   └── evaluation.py
│
└── notebooks/
    ├── 1_audio_exploration.ipynb
    ├── 2_model_training.ipynb
    └── 3_evaluation.ipynb

Steps and Tasks

1. Data Acquisition and Setup

First, let’s download and set up the LibriSpeech dataset (the train-clean-100 subset alone is several gigabytes, so the first download takes a while):

import torchaudio

# Download LibriSpeech dataset (clean subset)
train_dataset = torchaudio.datasets.LIBRISPEECH("./data", url="train-clean-100", download=True)
test_dataset = torchaudio.datasets.LIBRISPEECH("./data", url="test-clean", download=True)

# Check dataset info
print(f"Training samples: {len(train_dataset)}")
print(f"Test samples: {len(test_dataset)}")

Basic audio exploration:

import librosa
import librosa.display
import matplotlib.pyplot as plt

def explore_audio(waveform, sample_rate, title="Waveform"):
    """Plot waveform and spectrogram"""
    plt.figure(figsize=(15, 5))
    
    # Plot waveform
    plt.subplot(1, 2, 1)
    plt.plot(waveform)
    plt.title(title)
    
    # Plot spectrogram
    plt.subplot(1, 2, 2)
    spec = librosa.feature.melspectrogram(y=waveform, sr=sample_rate)
    librosa.display.specshow(librosa.power_to_db(spec), sr=sample_rate, y_axis='mel', x_axis='time')
    plt.title("Mel Spectrogram")
    plt.colorbar(format='%+2.0f dB')
    
    plt.tight_layout()
    plt.show()
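
To try this on a LibriSpeech sample, convert the torch tensor to a 1-D numpy array first (librosa expects numpy input). A minimal usage sketch, assuming the datasets from step 1 are loaded:

waveform, sample_rate, transcript, *_ = train_dataset[0]
explore_audio(waveform.squeeze().numpy(), sample_rate, title=transcript[:40])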

Advanced audio analysis (optional):

import numpy as np

class AudioAnalyzer:
    def __init__(self, sample_rate=16000):
        self.sample_rate = sample_rate
        
    def analyze_audio_file(self, file_path):
        """Comprehensive audio analysis"""
        # Load audio
        waveform, sr = librosa.load(file_path, sr=self.sample_rate)
        
        # Basic properties
        duration = librosa.get_duration(y=waveform, sr=sr)
        rms = librosa.feature.rms(y=waveform)
        
        # Extract features
        mfccs = librosa.feature.mfcc(y=waveform, sr=sr, n_mfcc=13)
        spectral_centroids = librosa.feature.spectral_centroid(y=waveform, sr=sr)
        spectral_rolloff = librosa.feature.spectral_rolloff(y=waveform, sr=sr)
        
        # Zero crossing rate
        zcr = librosa.feature.zero_crossing_rate(waveform)
        
        return {
            'duration': duration,
            'rms': rms[0],                  # frame-wise RMS curve (for plotting)
            'rms_energy': np.mean(rms),     # overall average energy
            'mfccs': mfccs,
            'spectral_centroids': spectral_centroids,
            'spectral_rolloff': spectral_rolloff,
            'zero_crossing_rate': np.mean(zcr)
        }
    
    def plot_features(self, features):
        """Visualize extracted features"""
        fig, axes = plt.subplots(3, 1, figsize=(15, 10))
        
        # Plot MFCCs
        librosa.display.specshow(features['mfccs'], ax=axes[0])
        axes[0].set_title('MFCCs')
        
        # Plot spectral features
        axes[1].plot(features['spectral_centroids'][0])
        axes[1].set_title('Spectral Centroid')
        
        # Plot frame-wise RMS energy
        axes[2].plot(features['rms'])
        axes[2].set_title('RMS Energy')
        
        plt.tight_layout()
        plt.show()
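
A quick way to exercise the analyzer end to end (the file path is just a placeholder for any recording in your custom_audio folder):

analyzer = AudioAnalyzer(sample_rate=16000)
features = analyzer.analyze_audio_file("data/custom_audio/sample.wav")  # placeholder path
print(f"Duration: {features['duration']:.2f}s, mean RMS: {features['rms_energy']:.4f}")
analyzer.plot_features(features)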

2. Audio Processing Pipeline

Create a robust audio processing pipeline:

import torch
import torchaudio

class AudioProcessor:
    def __init__(self, target_sample_rate=16000, duration=10):
        self.target_sample_rate = target_sample_rate
        self.duration = duration
        
    def preprocess_audio(self, waveform, sample_rate):
        """Basic audio preprocessing"""
        # Resample if needed
        if sample_rate != self.target_sample_rate:
            waveform = torchaudio.functional.resample(
                waveform, sample_rate, self.target_sample_rate
            )
            
        # Convert to mono if stereo
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)
            
        # Peak-normalize to [-1, 1] (guard against all-zero input)
        waveform = waveform / (waveform.abs().max() + 1e-8)
        
        return waveform
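
A short sketch of how the preprocessor fits together with the LibriSpeech loader from step 1:

processor = AudioProcessor(target_sample_rate=16000)
waveform, sample_rate, transcript, *_ = train_dataset[0]
clean = processor.preprocess_audio(waveform, sample_rate)
print(clean.shape, clean.abs().max())  # mono waveform, peak-normalized to ~1.0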

Advanced audio processing (optional):

import torch.nn as nn

class AdvancedAudioProcessor:
    def __init__(self, target_sample_rate=16000, duration=10):
        self.target_sample_rate = target_sample_rate
        self.duration = duration
        
        # SpecAugment-style masking (applied to spectrograms, not raw waveforms)
        self.augmentation = nn.Sequential(
            torchaudio.transforms.FrequencyMasking(freq_mask_param=30),
            torchaudio.transforms.TimeMasking(time_mask_param=100)
        )
        
    def apply_noise_reduction(self, waveform):
        """Apply noise reduction using spectral subtraction"""
        # Spectral subtraction is not implemented here; see the sketch below
        raise NotImplementedError
    
    def apply_augmentation(self, spectrogram):
        """Apply SpecAugment masking to a (log-)mel spectrogram"""
        return self.augmentation(spectrogram)
    
    def extract_features(self, waveform):
        """Extract audio features"""
        # Mel spectrogram
        mel_spec = torchaudio.transforms.MelSpectrogram(
            sample_rate=self.target_sample_rate,
            n_mels=128
        )(waveform)
        
        # Log-Mel spectrogram
        log_mel_spec = torchaudio.transforms.AmplitudeToDB()(mel_spec)
        
        return log_mel_spec
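
The apply_noise_reduction stub above is intentionally left open. Below is one minimal spectral-subtraction sketch, assuming a mono float tensor and that the first few STFT frames contain only background noise (both simplifying assumptions you will want to revisit):

import torch

def spectral_subtraction(waveform, n_fft=512, hop_length=128, noise_frames=10):
    """Estimate noise from the first frames and subtract it from every frame."""
    window = torch.hann_window(n_fft)
    stft = torch.stft(waveform, n_fft=n_fft, hop_length=hop_length,
                      window=window, return_complex=True)

    magnitude = stft.abs()
    phase = torch.angle(stft)

    # Average magnitude over the assumed noise-only frames
    noise_estimate = magnitude[:, :noise_frames].mean(dim=1, keepdim=True)

    # Subtract and clamp so magnitudes stay non-negative
    clean_magnitude = torch.clamp(magnitude - noise_estimate, min=0.0)

    # Rebuild the complex spectrogram and invert back to a waveform
    clean_stft = torch.polar(clean_magnitude, phase)
    return torch.istft(clean_stft, n_fft=n_fft, hop_length=hop_length,
                       window=window, length=waveform.shape[-1])

In practice you would call this inside apply_noise_reduction and tune n_fft, hop_length, and noise_frames for your recordings.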

3. Model Implementation

We’ll use the Wav2Vec2 model from HuggingFace for speech recognition:

import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

class SpeechRecognizer:
    def __init__(self):
        self.processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
        self.model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
        
    def transcribe(self, waveform):
        """Transcribe 16 kHz mono audio (1-D array or tensor) to text"""
        # Preprocess
        inputs = self.processor(
            waveform, 
            sampling_rate=16000, 
            return_tensors="pt", 
            padding=True
        )
        
        # Get logits
        with torch.no_grad():
            logits = self.model(inputs.input_values).logits
            
        # Decode
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = self.processor.batch_decode(predicted_ids)
        
        return transcription[0]
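
Putting the pieces together on a LibriSpeech test utterance (LibriSpeech is already 16 kHz mono, so no extra preprocessing is needed here):

recognizer = SpeechRecognizer()
waveform, sample_rate, transcript, *_ = test_dataset[0]
prediction = recognizer.transcribe(waveform.squeeze().numpy())
print("Reference: ", transcript)
print("Prediction:", prediction)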

Advanced model implementations (optional):

class AdvancedSpeechRecognizer:
    def __init__(self, model_name="facebook/wav2vec2-large-960h-lv60-self"):
        self.processor = Wav2Vec2Processor.from_pretrained(model_name)
        self.model = Wav2Vec2ForCTC.from_pretrained(model_name)
        
    def transcribe_with_timestamps(self, waveform):
        """Transcribe audio with word-level timestamps"""
        # CTC decoding with timestamps; a frame-level sketch is shown below
        raise NotImplementedError
    
    def transcribe_with_confidence(self, waveform):
        """Transcribe audio and return an average confidence score"""
        # Get logits and compute frame-level probabilities
        inputs = self.processor(waveform, sampling_rate=16000, return_tensors="pt")
        with torch.no_grad():
            logits = self.model(inputs.input_values).logits
            probs = torch.softmax(logits, dim=-1)
            
        # Greedy CTC decode plus the mean probability of the chosen tokens
        predicted_ids = torch.argmax(probs, dim=-1)
        transcription = self.processor.batch_decode(predicted_ids)[0]
        confidence = probs.max(dim=-1).values.mean().item()
        
        return transcription, confidence
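
One rough way to fill in transcribe_with_timestamps is to exploit the CTC frame grid: the model emits one logit frame per fixed chunk of audio, so greedy-decoded characters can be mapped back to time. The helper below is an approximate character-level sketch (the "|" token marks word boundaries), not a replacement for a proper forced aligner:

def rough_char_timestamps(recognizer, waveform, sample_rate=16000):
    """Approximate per-character timestamps from greedy CTC output."""
    inputs = recognizer.processor(waveform, sampling_rate=sample_rate, return_tensors="pt")
    with torch.no_grad():
        logits = recognizer.model(inputs.input_values).logits[0]  # (frames, vocab)

    predicted_ids = torch.argmax(logits, dim=-1)
    # Seconds of audio represented by one logit frame
    frame_duration = inputs.input_values.shape[-1] / sample_rate / logits.shape[0]

    timestamps = []
    previous_id = -1
    for frame_index, token_id in enumerate(predicted_ids.tolist()):
        # Skip CTC blanks (the pad token) and repeated frames
        if token_id == recognizer.processor.tokenizer.pad_token_id or token_id == previous_id:
            previous_id = token_id
            continue
        char = recognizer.processor.tokenizer.convert_ids_to_tokens(token_id)
        timestamps.append((char, round(frame_index * frame_duration, 2)))
        previous_id = token_id

    return timestamps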

4. Cloud API Integration

Integrate the Google Cloud Speech-to-Text API:

from google.cloud import speech

def transcribe_audio_google(audio_path):
    """Transcribe audio using Google Cloud Speech-to-Text"""
    client = speech.SpeechClient()
    
    # Read audio file
    with open(audio_path, "rb") as audio_file:
        content = audio_file.read()
        
    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code="en-US",
    )
    
    # Perform transcription
    response = client.recognize(config=config, audio=audio)
    
    return " ".join(result.alternatives[0].transcript 
                   for result in response.results)
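
Before running this, point the GOOGLE_APPLICATION_CREDENTIALS environment variable at a service-account key for a project with the Speech-to-Text API enabled. The config above assumes 16 kHz LINEAR16 (PCM WAV) audio; the path below is a placeholder:

transcript = transcribe_audio_google("data/custom_audio/sample.wav")  # placeholder path
print(transcript)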

Multi-API implementation (optional):

class CloudSpeechRecognizer:
    def __init__(self):
        self.google_client = speech.SpeechClient()
        # Initialize other API clients as needed
        
    def transcribe_google(self, audio_path, language="en-US"):
        """Transcribe using Google Cloud"""
        # Same flow as transcribe_audio_google above
        return transcribe_audio_google(audio_path)
        
    def transcribe_azure(self, audio_path, language="en-US"):
        """Transcribe using Azure Speech Services"""
        raise NotImplementedError  # requires the Azure Speech SDK
        
    def transcribe_aws(self, audio_path, language="en-US"):
        """Transcribe using Amazon Transcribe"""
        raise NotImplementedError  # requires boto3 and audio uploaded to S3
        
    def transcribe_multiple(self, audio_path):
        """Transcribe using multiple APIs and compare results"""
        results = {
            'google': self.transcribe_google(audio_path),
            'azure': self.transcribe_azure(audio_path),
            'aws': self.transcribe_aws(audio_path)
        }
        return results
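
As one example of filling in the transcribe_azure stub, the sketch below uses the azure-cognitiveservices-speech SDK (an extra dependency not in the install list above); you will need your own subscription key and region, and should double-check the current SDK docs:

import azure.cognitiveservices.speech as speechsdk

def transcribe_azure_file(audio_path, key, region, language="en-US"):
    """Single-utterance file transcription with the Azure Speech SDK (sketch)."""
    speech_config = speechsdk.SpeechConfig(subscription=key, region=region)
    speech_config.speech_recognition_language = language
    audio_config = speechsdk.audio.AudioConfig(filename=audio_path)

    recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config,
                                            audio_config=audio_config)
    result = recognizer.recognize_once()  # stops at the first pause in speech
    return result.text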

5. Evaluation

Implement comprehensive evaluation metrics:

from jiwer import wer, mer, wil

def evaluate_transcription(reference, hypothesis):
    """Evaluate transcription using multiple metrics"""
    metrics = {
        'WER': wer(reference, hypothesis),
        'MER': mer(reference, hypothesis),
        'WIL': wil(reference, hypothesis)
    }
    
    return metrics

# Example usage
reference = "the quick brown fox jumps over the lazy dog"
hypothesis = "the quick brown fox jumps over the lazy"
metrics = evaluate_transcription(reference, hypothesis)
print(metrics)
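
WER is case- and punctuation-sensitive, so it is common to normalize both strings before scoring. A minimal sketch using plain Python (jiwer also ships its own transform utilities):

import string

def normalize_text(text):
    """Lowercase and strip punctuation before scoring."""
    return text.lower().translate(str.maketrans("", "", string.punctuation))

raw_reference = "The quick brown fox, jumps over the lazy dog."
raw_hypothesis = "the quick brown fox jumps over the lazy"
print(evaluate_transcription(normalize_text(raw_reference), normalize_text(raw_hypothesis)))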

Advanced evaluation tools (optional):

import numpy as np
import pandas as pd

class TranscriptionEvaluator:
    def __init__(self):
        self.metrics_history = []
        
    def evaluate_single(self, reference, hypothesis):
        """Evaluate a single reference/hypothesis pair"""
        return evaluate_transcription(reference, hypothesis)
        
    def evaluate_batch(self, references, hypotheses):
        """Evaluate batch of transcriptions"""
        results = []
        for ref, hyp in zip(references, hypotheses):
            metrics = self.evaluate_single(ref, hyp)
            results.append(metrics)
        
        # Compute average metrics
        avg_metrics = {
            metric: np.mean([r[metric] for r in results])
            for metric in results[0].keys()
        }
        
        self.metrics_history.append(avg_metrics)
        return avg_metrics
    
    def plot_metrics_history(self):
        """Plot metrics over time"""
        metrics_df = pd.DataFrame(self.metrics_history)
        
        plt.figure(figsize=(12, 6))
        for metric in metrics_df.columns:
            plt.plot(metrics_df[metric], label=metric)
            
        plt.title('Transcription Metrics Over Time')
        plt.xlabel('Batch')
        plt.ylabel('Score')
        plt.legend()
        plt.show()

6. Real-Time Speech Recognition

Implement real-time speech recognition:

import pyaudio
import wave

class RealtimeSpeechRecognizer:
    def __init__(self):
        self.recognizer = SpeechRecognizer()
        self.chunk = 1024
        self.format = pyaudio.paFloat32
        self.channels = 1
        self.rate = 16000
        
    def record_audio(self, seconds=5):
        """Record audio from microphone"""
        p = pyaudio.PyAudio()
        
        stream = p.open(format=self.format,
                       channels=self.channels,
                       rate=self.rate,
                       input=True,
                       frames_per_buffer=self.chunk)
        
        frames = []
        for _ in range(0, int(self.rate / self.chunk * seconds)):
            data = stream.read(self.chunk)
            frames.append(data)
            
        stream.stop_stream()
        stream.close()
        p.terminate()
        
        # Raw float32 bytes; convert with np.frombuffer before transcribing
        return b''.join(frames)
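
record_audio returns raw float32 bytes, so convert them to a numpy array before handing them to the Wav2Vec2 recognizer. A minimal usage sketch:

import numpy as np

realtime = RealtimeSpeechRecognizer()
audio_bytes = realtime.record_audio(seconds=5)

# paFloat32 samples map directly onto a float32 numpy array
audio = np.frombuffer(audio_bytes, dtype=np.float32)
print(realtime.recognizer.transcribe(audio))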

Streaming implementation (optional):

class StreamingSpeechRecognizer:
    def __init__(self):
        self.recognizer = SpeechRecognizer()
        self.audio_config = speech.StreamingRecognitionConfig(
            config=speech.RecognitionConfig(
                encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
                sample_rate_hertz=16000,
                language_code="en-US",
                enable_automatic_punctuation=True,
            ),