Sparse vs Dense Embeddings

Understanding the fundamental differences between sparse lexical representations and dense neural embeddings is crucial for building effective search systems.

Interactive Comparison

Fundamental Differences

Sparse Embeddings (Lexical)

High dimensional (vocabulary size: 30K-1M)
Few non-zero values (~10-100 per document)
Exact term matching
Interpretable (each dimension = word)
No training required

Dense Embeddings (Neural)

Low dimensional (128-768 dimensions)
All non-zero values (100% dense)
Semantic matching
Black box (dimensions lack meaning)
Requires training

Sparse Embeddings Deep Dive

TF-IDF (Term Frequency-Inverse Document Frequency)

\text{TF-IDF}(t, d, D) = \text{TF}(t, d) × \text{IDF}(t, D)

Where:

\text{TF}(t, d) = f_t,dΣ_{t' ∈ d} f_t',d = Term frequency
\text{IDF}(t, D) = log |D||\{d ∈ D : t ∈ d\}| = Inverse document frequency

from sklearn.feature_extraction.text import TfidfVectorizer

# Create TF-IDF vectors
vectorizer = TfidfVectorizer(max_features=10000)
sparse_embeddings = vectorizer.fit_transform(documents)

# Sparse matrix stats
print(f"Shape: {sparse_embeddings.shape}")
print(f"Non-zero: {sparse_embeddings.nnz}")
print(f"Sparsity: {1 - sparse_embeddings.nnz / (sparse_embeddings.shape[0] * sparse_embeddings.shape[1]):.2%}")

BM25 (Best Matching 25)

More sophisticated than TF-IDF:

\text{BM25}(D, Q) = Σ_i=1ⁿ \text{IDF}(q_i) · f(q_i, D) · (k₁ + 1)f(q_i, D) + k₁ · (1 - b + b · |D|\text{avgdl})

from rank_bm25 import BM25Okapi

# Create BM25 index
tokenized_docs = [doc.split() for doc in documents]
bm25 = BM25Okapi(tokenized_docs)

# Search
query = "machine learning algorithms"
scores = bm25.get_scores(query.split())
top_k = np.argsort(scores)[-10:][::-1]

Inverted Index

Efficient sparse retrieval:

class InvertedIndex:
    def __init__(self):
        self.index = {}  # term -> list of (doc_id, tf)
        self.doc_lengths = {}
        self.avg_doc_length = 0

    def add_document(self, doc_id, text):
        tokens = text.lower().split()
        self.doc_lengths[doc_id] = len(tokens)

        # Count term frequencies
        term_freqs = {}
        for token in tokens:
            term_freqs[token] = term_freqs.get(token, 0) + 1

        # Update inverted index
        for term, freq in term_freqs.items():
            if term not in self.index:
                self.index[term] = []
            self.index[term].append((doc_id, freq))

        # Update average document length
        self.avg_doc_length = sum(self.doc_lengths.values()) / len(self.doc_lengths)

    def search(self, query, k=10):
        query_terms = query.lower().split()
        doc_scores = {}

        for term in query_terms:
            if term in self.index:
                # IDF score
                idf = math.log(len(self.doc_lengths) / len(self.index[term]))

                for doc_id, tf in self.index[term]:
                    # BM25 scoring
                    doc_len = self.doc_lengths[doc_id]
                    k1, b = 1.2, 0.75

                    score = idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * doc_len / self.avg_doc_length))
                    doc_scores[doc_id] = doc_scores.get(doc_id, 0) + score

        # Sort and return top-k
        sorted_docs = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)
        return sorted_docs[:k]

Dense Embeddings Deep Dive

Sentence-BERT Architecture

from sentence_transformers import SentenceTransformer
import torch.nn as nn

class SentenceBERT(nn.Module):
    def __init__(self, model_name='bert-base-uncased'):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.pooling = MeanPooling()

    def forward(self, input_ids, attention_mask):
        # BERT encoding
        outputs = self.bert(input_ids, attention_mask=attention_mask)

        # Mean pooling
        embeddings = self.pooling(outputs.last_hidden_state, attention_mask)

        # Normalize
        embeddings = F.normalize(embeddings, p=2, dim=1)

        return embeddings

class MeanPooling(nn.Module):
    def forward(self, token_embeddings, attention_mask):
        # Expand mask
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()

        # Sum embeddings
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)

        # Sum mask
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)

        # Mean pooling
        return sum_embeddings / sum_mask

Contrastive Training

def contrastive_loss(embeddings1, embeddings2, temperature=0.07):
    """InfoNCE loss for training dense encoders"""
    # Normalize
    embeddings1 = F.normalize(embeddings1, p=2, dim=1)
    embeddings2 = F.normalize(embeddings2, p=2, dim=1)

    # Compute similarities
    similarities = torch.matmul(embeddings1, embeddings2.T) / temperature

    # Labels: positives are on diagonal
    batch_size = embeddings1.shape[0]
    labels = torch.arange(batch_size).to(embeddings1.device)

    # Cross-entropy loss
    loss = F.cross_entropy(similarities, labels)

    return loss

Approximate Nearest Neighbor Search

import faiss

class DenseIndex:
    def __init__(self, dimension=768):
        # Create FAISS index
        self.index = faiss.IndexFlatIP(dimension)  # Inner product
        self.documents = []

    def add_documents(self, documents, encoder):
        # Encode documents
        embeddings = encoder.encode(documents, batch_size=32)

        # Normalize for cosine similarity
        faiss.normalize_L2(embeddings)

        # Add to index
        self.index.add(embeddings)
        self.documents.extend(documents)

    def search(self, query, encoder, k=10):
        # Encode query
        query_embedding = encoder.encode([query])
        faiss.normalize_L2(query_embedding)

        # Search
        distances, indices = self.index.search(query_embedding, k)

        # Return results
        results = []
        for idx, dist in zip(indices[0], distances[0]):
            results.append({
                'document': self.documents[idx],
                'score': dist
            })

        return results

Comparison Analysis

Retrieval Quality

Aspect	Sparse (BM25)	Dense (BERT)
Exact matching	✅ Excellent	❌ Poor
Synonyms	❌ Cannot handle	✅ Excellent
Typos	❌ Fails	✅ Handles well
Long queries	✅ Good	⚠️ May truncate
Rare terms	✅ Excellent	⚠️ May miss
Common words	⚠️ Down-weighted	✅ Contextual

Performance Metrics

def compare_retrieval_methods(queries, relevant_docs, sparse_index, dense_index):
    """Compare sparse and dense retrieval"""
    results = {'sparse': [], 'dense': []}

    for query in queries:
        # Sparse retrieval
        sparse_results = sparse_index.search(query, k=10)
        sparse_recall = compute_recall(sparse_results, relevant_docs[query])
        results['sparse'].append(sparse_recall)

        # Dense retrieval
        dense_results = dense_index.search(query, k=10)
        dense_recall = compute_recall(dense_results, relevant_docs[query])
        results['dense'].append(dense_recall)

    print(f"Sparse Recall@10: {np.mean(results['sparse']):.3f}")
    print(f"Dense Recall@10: {np.mean(results['dense']):.3f}")

Resource Requirements

Metric	Sparse	Dense
Index size	~10% of corpus	~100% of corpus
Query latency	<10ms	50-200ms
RAM usage	Low	High
GPU required	No	Recommended
Training data	None	Large corpus

Hybrid Approaches

1. Linear Combination

def hybrid_search(query, sparse_index, dense_index, alpha=0.5):
    """Combine sparse and dense scores"""
    # Get candidates from both
    sparse_results = sparse_index.search(query, k=100)
    dense_results = dense_index.search(query, k=100)

    # Combine scores
    combined_scores = {}

    for doc_id, score in sparse_results:
        combined_scores[doc_id] = alpha * normalize_score(score, 'sparse')

    for doc_id, score in dense_results:
        if doc_id in combined_scores:
            combined_scores[doc_id] += (1 - alpha) * normalize_score(score, 'dense')
        else:
            combined_scores[doc_id] = (1 - alpha) * normalize_score(score, 'dense')

    # Sort and return
    sorted_results = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
    return sorted_results[:10]

2. Reciprocal Rank Fusion

def reciprocal_rank_fusion(rankings, k=60):
    """RRF: Robust fusion method"""
    rrf_scores = {}

    for ranking in rankings:
        for rank, doc_id in enumerate(ranking, 1):
            if doc_id not in rrf_scores:
                rrf_scores[doc_id] = 0
            rrf_scores[doc_id] += 1 / (k + rank)

    # Sort by RRF score
    sorted_docs = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)
    return sorted_docs

3. Learned Sparse Representations (SPLADE)

class SPLADE(nn.Module):
    """Learned sparse representations"""
    def __init__(self, bert_model):
        super().__init__()
        self.bert = bert_model

    def forward(self, input_ids, attention_mask):
        # Get BERT outputs
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        hidden = outputs.last_hidden_state

        # Project to vocabulary space
        logits = self.bert.cls(hidden)  # [batch, seq_len, vocab_size]

        # Max pooling over sequence
        weights = torch.max(logits, dim=1).values

        # Sparsify with log saturation
        sparse = torch.log(1 + F.relu(weights))

        return sparse  # [batch, vocab_size] sparse vector

4. Dense-First, Sparse-Refine

def cascade_retrieval(query, sparse_index, dense_index):
    """Two-stage retrieval pipeline"""
    # Stage 1: Dense retrieval for semantic matching
    candidates = dense_index.search(query, k=1000)
    candidate_ids = [c['id'] for c in candidates]

    # Stage 2: Sparse re-ranking for precision
    sparse_scores = []
    for doc_id in candidate_ids:
        score = sparse_index.score_document(query, doc_id)
        sparse_scores.append((doc_id, score))

    # Combine with original dense scores
    final_scores = []
    for i, (doc_id, sparse_score) in enumerate(sparse_scores):
        dense_score = candidates[i]['score']
        combined = 0.3 * dense_score + 0.7 * sparse_score
        final_scores.append((doc_id, combined))

    return sorted(final_scores, key=lambda x: x[1], reverse=True)[:10]

Choosing the Right Approach

Decision Matrix

def choose_retrieval_method(requirements):
    """Recommend retrieval approach based on requirements"""

    if requirements['exact_matching_critical']:
        if requirements['semantic_understanding_needed']:
            return 'hybrid'
        else:
            return 'sparse'

    if requirements['vocabulary_mismatch_common']:
        return 'dense'

    if requirements['latency_critical'] and requirements['latency_budget_ms'] < 20:
        return 'sparse'

    if requirements['index_size_constraint']:
        return 'sparse'

    if requirements['multilingual']:
        return 'dense'

    # Default to hybrid for best quality
    return 'hybrid'

Use Cases

Sparse is Better For:

Legal document search (exact terms matter)
Code search (specific function names)
E-commerce (product IDs, SKUs)
Low-latency requirements (<20ms)

Dense is Better For:

Question answering
Semantic similarity
Cross-lingual search
Conversational search

Hybrid is Better For:

Enterprise search
Academic search
Medical information retrieval
General web search

Implementation Best Practices

1. Index Optimization

# Sparse optimization
sparse_index = BM25Index(
    k1=1.2,  # Term saturation
    b=0.75,  # Length normalization
    epsilon=0.25  # Floor value
)

# Dense optimization
dense_index = faiss.index_factory(
    768,  # Dimension
    "IVF4096,PQ64",  # Inverted file with product quantization
    faiss.METRIC_INNER_PRODUCT
)

2. Query Processing

def process_query(query, mode='hybrid'):
    """Query preprocessing pipeline"""
    # Common preprocessing
    query = query.lower().strip()

    if mode in ['sparse', 'hybrid']:
        # Sparse-specific
        query_sparse = remove_stopwords(query)
        query_sparse = stem_tokens(query_sparse)

    if mode in ['dense', 'hybrid']:
        # Dense-specific
        query_dense = expand_abbreviations(query)
        query_dense = query_dense[:512]  # Truncate for BERT

    return query_sparse, query_dense

3. Evaluation Metrics

def evaluate_retrieval(system, test_queries, relevance_judgments):
    """Comprehensive retrieval evaluation"""
    metrics = {
        'mrr': [],  # Mean Reciprocal Rank
        'map': [],  # Mean Average Precision
        'ndcg': [],  # Normalized Discounted Cumulative Gain
        'recall@k': {1: [], 5: [], 10: [], 100: []}
    }

    for query, relevant in zip(test_queries, relevance_judgments):
        results = system.search(query, k=100)

        # Compute metrics
        metrics['mrr'].append(compute_mrr(results, relevant))
        metrics['map'].append(compute_map(results, relevant))
        metrics['ndcg'].append(compute_ndcg(results, relevant))

        for k in [1, 5, 10, 100]:
            recall = compute_recall_at_k(results[:k], relevant)
            metrics['recall@k'][k].append(recall)

    # Average metrics
    return {
        'MRR': np.mean(metrics['mrr']),
        'MAP': np.mean(metrics['map']),
        'NDCG@10': np.mean(metrics['ndcg']),
        'Recall@10': np.mean(metrics['recall@k'][10])
    }

References

Robertson & Zaragoza "The Probabilistic Relevance Framework: BM25 and Beyond"
Reimers & Gurevych "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks"
Formal et al. "SPLADE: Sparse Lexical and Expansion Model for First Stage Ranking"
Ma et al. "A Replication Study of Dense Passage Retriever"
Khattab & Zaharia "ColBERT: Efficient and Effective Passage Search"