Sparse vs Dense Embeddings
Understanding the fundamental differences between sparse lexical representations and dense neural embeddings is crucial for building effective search systems.
Fundamental Differences
Sparse Embeddings (Lexical)
- High-dimensional (vocabulary size: 30K-1M)
- Few non-zero values (~10-100 per document)
- Exact term matching
- Interpretable (each dimension corresponds to a word)
- No training required (only corpus statistics)
Dense Embeddings (Neural)
- Low-dimensional (128-768 dimensions)
- All values non-zero (100% dense)
- Semantic matching
- Black box (individual dimensions lack meaning)
- Requires training
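To make the contrast concrete, here is a minimal sketch that embeds the same two toy documents both ways. It assumes scikit-learn and sentence-transformers are installed; `all-MiniLM-L6-v2` is just one commonly used 384-dimensional example model.

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer

docs = ["the cat sat on the mat", "dogs chase cats"]

# Sparse: one dimension per vocabulary term, mostly zeros
sparse = TfidfVectorizer().fit_transform(docs)
print(sparse.shape, sparse.nnz)  # (2, 8) 8: 8-term vocabulary, only 8 stored values

# Dense: fixed 384 dimensions, every value non-zero
dense = SentenceTransformer("all-MiniLM-L6-v2").encode(docs)
print(dense.shape)               # (2, 384)
```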
Sparse Embeddings Deep Dive
TF-IDF (Term Frequency-Inverse Document Frequency)
\text{TF-IDF}(t, d, D) = \text{TF}(t, d) \times \text{IDF}(t, D)
Where:
- \text{TF}(t, d) = \frac{f_{t,d}}{\sum_{t' \in d} f_{t',d}} is the term frequency: the count of t in d, normalized by document length
- \text{IDF}(t, D) = \log \frac{|D|}{|\{d \in D : t \in d\}|} is the inverse document frequency
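A quick worked example with made-up numbers: suppose term t occurs 3 times in a 100-token document and appears in 10 of 1,000 documents (using the natural log):

\text{TF} = \frac{3}{100} = 0.03, \quad \text{IDF} = \ln\frac{1000}{10} \approx 4.61, \quad \text{TF-IDF} \approx 0.03 \times 4.61 \approx 0.138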
```python
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TF-IDF vectors
vectorizer = TfidfVectorizer(max_features=10000)
sparse_embeddings = vectorizer.fit_transform(documents)

# Sparse matrix stats
print(f"Shape: {sparse_embeddings.shape}")
print(f"Non-zero: {sparse_embeddings.nnz}")
print(f"Sparsity: {1 - sparse_embeddings.nnz / (sparse_embeddings.shape[0] * sparse_embeddings.shape[1]):.2%}")
```
BM25 (Best Matching 25)
A more refined scoring function than TF-IDF, with tunable term-frequency saturation (k_1, typically 1.2-2.0) and document-length normalization (b, typically 0.75):
\text{BM25}(D, Q) = \sum_{i=1}^{n} \text{IDF}(q_i) \cdot \frac{f(q_i, D) \cdot (k_1 + 1)}{f(q_i, D) + k_1 \cdot \left(1 - b + b \cdot \frac{|D|}{\text{avgdl}}\right)}
where f(q_i, D) is the frequency of query term q_i in document D and avgdl is the average document length in the collection.
```python
import numpy as np
from rank_bm25 import BM25Okapi

# Create BM25 index
tokenized_docs = [doc.split() for doc in documents]
bm25 = BM25Okapi(tokenized_docs)

# Search
query = "machine learning algorithms"
scores = bm25.get_scores(query.split())
top_k = np.argsort(scores)[-10:][::-1]
```
Inverted Index
Efficient sparse retrieval:
```python
import math

class InvertedIndex:
    def __init__(self):
        self.index = {}          # term -> list of (doc_id, tf)
        self.doc_lengths = {}
        self.avg_doc_length = 0

    def add_document(self, doc_id, text):
        tokens = text.lower().split()
        self.doc_lengths[doc_id] = len(tokens)

        # Count term frequencies
        term_freqs = {}
        for token in tokens:
            term_freqs[token] = term_freqs.get(token, 0) + 1

        # Update inverted index
        for term, freq in term_freqs.items():
            if term not in self.index:
                self.index[term] = []
            self.index[term].append((doc_id, freq))

        # Update average document length
        self.avg_doc_length = sum(self.doc_lengths.values()) / len(self.doc_lengths)

    def search(self, query, k=10):
        query_terms = query.lower().split()
        doc_scores = {}

        for term in query_terms:
            if term in self.index:
                # IDF score
                idf = math.log(len(self.doc_lengths) / len(self.index[term]))

                for doc_id, tf in self.index[term]:
                    # BM25 scoring
                    doc_len = self.doc_lengths[doc_id]
                    k1, b = 1.2, 0.75
                    score = idf * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * doc_len / self.avg_doc_length))
                    doc_scores[doc_id] = doc_scores.get(doc_id, 0) + score

        # Sort and return top-k
        sorted_docs = sorted(doc_scores.items(), key=lambda x: x[1], reverse=True)
        return sorted_docs[:k]
```
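A quick usage sketch (the document texts are made up):

```python
index = InvertedIndex()
index.add_document(0, "machine learning algorithms for search")
index.add_document(1, "deep learning with neural networks")

print(index.search("learning algorithms", k=2))
# list of (doc_id, BM25 score) pairs, highest-scoring first
```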
Dense Embeddings Deep Dive
Sentence-BERT Architecture
```python
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel

class MeanPooling(nn.Module):
    def forward(self, token_embeddings, attention_mask):
        # Expand mask to the embedding dimension
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        # Sum embeddings over non-padding tokens
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
        # Count non-padding tokens (clamped to avoid division by zero)
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        # Mean pooling
        return sum_embeddings / sum_mask

class SentenceBERT(nn.Module):
    def __init__(self, model_name='bert-base-uncased'):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.pooling = MeanPooling()

    def forward(self, input_ids, attention_mask):
        # BERT encoding
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        # Mean pooling over token embeddings
        embeddings = self.pooling(outputs.last_hidden_state, attention_mask)
        # L2-normalize so dot product equals cosine similarity
        embeddings = F.normalize(embeddings, p=2, dim=1)
        return embeddings
```
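A usage sketch, assuming the Hugging Face tokenizer for the same checkpoint:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = SentenceBERT()

batch = tokenizer(["what is machine learning?"],
                  padding=True, truncation=True, return_tensors='pt')
with torch.no_grad():
    embedding = model(batch['input_ids'], batch['attention_mask'])
print(embedding.shape)  # torch.Size([1, 768])
```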
Contrastive Training
```python
import torch
import torch.nn.functional as F

def contrastive_loss(embeddings1, embeddings2, temperature=0.07):
    """InfoNCE loss for training dense encoders."""
    # Normalize so similarities are cosines
    embeddings1 = F.normalize(embeddings1, p=2, dim=1)
    embeddings2 = F.normalize(embeddings2, p=2, dim=1)

    # Pairwise similarity matrix, scaled by temperature
    similarities = torch.matmul(embeddings1, embeddings2.T) / temperature

    # Labels: the positive pair for row i is column i (the diagonal)
    batch_size = embeddings1.shape[0]
    labels = torch.arange(batch_size).to(embeddings1.device)

    # Cross-entropy treats every other in-batch example as a negative
    loss = F.cross_entropy(similarities, labels)
    return loss
```
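In a training loop, each batch of (query, positive passage) pairs supplies its own negatives: every non-matching passage in the batch serves as a negative for every query. A minimal step, assuming `query_encoder` and `doc_encoder` are `SentenceBERT` instances and `q_batch`/`d_batch` are tokenized pairs (hypothetical names):

```python
optimizer = torch.optim.AdamW(
    list(query_encoder.parameters()) + list(doc_encoder.parameters()), lr=2e-5)

q_emb = query_encoder(q_batch['input_ids'], q_batch['attention_mask'])
d_emb = doc_encoder(d_batch['input_ids'], d_batch['attention_mask'])

loss = contrastive_loss(q_emb, d_emb)  # in-batch negatives via the diagonal labels
loss.backward()
optimizer.step()
optimizer.zero_grad()
```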
Approximate Nearest Neighbor Search
```python
import faiss

class DenseIndex:
    def __init__(self, dimension=768):
        # Create FAISS index (inner product on normalized vectors = cosine)
        self.index = faiss.IndexFlatIP(dimension)
        self.documents = []

    def add_documents(self, documents, encoder):
        # Encode documents (sentence-transformers returns float32 numpy arrays)
        embeddings = encoder.encode(documents, batch_size=32)
        # Normalize in place so inner product equals cosine similarity
        faiss.normalize_L2(embeddings)
        # Add to index
        self.index.add(embeddings)
        self.documents.extend(documents)

    def search(self, query, encoder, k=10):
        # Encode and normalize the query
        query_embedding = encoder.encode([query])
        faiss.normalize_L2(query_embedding)
        # Search
        distances, indices = self.index.search(query_embedding, k)
        # Return results
        results = []
        for idx, dist in zip(indices[0], distances[0]):
            results.append({
                'document': self.documents[idx],
                'score': float(dist)
            })
        return results
```
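Putting it together, a short end-to-end sketch (the model name is just an example; its output dimension, 384, must match the index):

```python
from sentence_transformers import SentenceTransformer

encoder = SentenceTransformer('all-MiniLM-L6-v2')  # 384-dim model
index = DenseIndex(dimension=384)
index.add_documents(["how to train a model", "recipe for pancakes"], encoder)

for hit in index.search("model training tips", encoder, k=2):
    print(f"{hit['score']:.3f}  {hit['document']}")
```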
Comparison Analysis
Retrieval Quality
| Aspect | Sparse (BM25) | Dense (BERT) |
|---|---|---|
| Exact matching | ✅ Excellent | ❌ Poor |
| Synonyms | ❌ Cannot handle | ✅ Excellent |
| Typos | ❌ Fails | ✅ Handles well |
| Long queries | ✅ Good | ⚠️ May truncate |
| Rare terms | ✅ Excellent | ⚠️ May miss |
| Common words | ⚠️ Down-weighted | ✅ Contextual |
Performance Metrics
```python
import numpy as np

def compare_retrieval_methods(queries, relevant_docs, sparse_index, dense_index):
    """Compare sparse and dense retrieval on the same query set."""
    results = {'sparse': [], 'dense': []}

    for query in queries:
        # Sparse retrieval
        sparse_results = sparse_index.search(query, k=10)
        sparse_recall = compute_recall(sparse_results, relevant_docs[query])
        results['sparse'].append(sparse_recall)

        # Dense retrieval
        dense_results = dense_index.search(query, k=10)
        dense_recall = compute_recall(dense_results, relevant_docs[query])
        results['dense'].append(dense_recall)

    print(f"Sparse Recall@10: {np.mean(results['sparse']):.3f}")
    print(f"Dense Recall@10: {np.mean(results['dense']):.3f}")
```
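`compute_recall` is left undefined above; a minimal version, assuming results are (doc_id, score) pairs as returned by the inverted index earlier and `relevant` is a collection of relevant document IDs:

```python
def compute_recall(results, relevant):
    """Fraction of relevant documents that appear among the results."""
    retrieved_ids = {doc_id for doc_id, _ in results}  # assumes (doc_id, score) pairs
    if not relevant:
        return 0.0
    return len(retrieved_ids & set(relevant)) / len(relevant)
```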
Resource Requirements
| Metric | Sparse | Dense |
|---|---|---|
| Index size | ~10% of corpus | ~100% of corpus |
| Query latency | <10ms | 50-200ms |
| RAM usage | Low | High |
| GPU required | No | Recommended |
| Training data | None | Large corpus |
Hybrid Approaches
1. Linear Combination
```python
def hybrid_search(query, sparse_index, dense_index, alpha=0.5):
    """Combine sparse and dense scores.

    Assumes both indexes return (doc_id, score) pairs.
    """
    # Get candidates from both
    sparse_results = sparse_index.search(query, k=100)
    dense_results = dense_index.search(query, k=100)

    # Combine scores
    combined_scores = {}
    for doc_id, score in sparse_results:
        combined_scores[doc_id] = alpha * normalize_score(score, 'sparse')

    for doc_id, score in dense_results:
        if doc_id in combined_scores:
            combined_scores[doc_id] += (1 - alpha) * normalize_score(score, 'dense')
        else:
            combined_scores[doc_id] = (1 - alpha) * normalize_score(score, 'dense')

    # Sort and return
    sorted_results = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)
    return sorted_results[:10]
```
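`normalize_score` is not defined above. The score ranges of BM25 (unbounded) and cosine similarity (-1 to 1) are incompatible, so some normalization is required before mixing. One sketch uses fixed assumed ranges; real systems usually normalize per query over the candidate set instead:

```python
# Hypothetical min-max normalizer against assumed score ranges.
SCORE_RANGES = {'sparse': (0.0, 30.0),   # BM25 is unbounded; 30 is just a guess
                'dense': (-1.0, 1.0)}    # cosine similarity range

def normalize_score(score, kind):
    lo, hi = SCORE_RANGES[kind]
    return (min(max(score, lo), hi) - lo) / (hi - lo)
```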
2. Reciprocal Rank Fusion
```python
def reciprocal_rank_fusion(rankings, k=60):
    """RRF: robust fusion of multiple ranked lists."""
    rrf_scores = {}

    for ranking in rankings:
        for rank, doc_id in enumerate(ranking, 1):
            if doc_id not in rrf_scores:
                rrf_scores[doc_id] = 0
            rrf_scores[doc_id] += 1 / (k + rank)

    # Sort by RRF score
    sorted_docs = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)
    return sorted_docs
```
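RRF only needs document IDs in rank order, which sidesteps score normalization entirely. For example:

```python
sparse_ranking = ['d3', 'd1', 'd7']   # BM25 order
dense_ranking = ['d1', 'd9', 'd3']    # dense order

print(reciprocal_rank_fusion([sparse_ranking, dense_ranking]))
# d1 and d3 rise to the top because both lists rank them highly
```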
3. Learned Sparse Representations (SPLADE)
```python
import torch
import torch.nn as nn
from transformers import AutoModelForMaskedLM

class SPLADE(nn.Module):
    """Learned sparse lexical representations over the vocabulary."""
    def __init__(self, model_name='bert-base-uncased'):
        super().__init__()
        # The masked-LM head projects hidden states to vocabulary space
        self.bert = AutoModelForMaskedLM.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask):
        # MLM logits: [batch, seq_len, vocab_size]
        logits = self.bert(input_ids, attention_mask=attention_mask).logits
        # Log-saturated activation, masked so padding contributes nothing
        weights = torch.log1p(torch.relu(logits)) * attention_mask.unsqueeze(-1)
        # Max pooling over the sequence gives one weight per vocabulary term
        sparse = torch.max(weights, dim=1).values
        return sparse  # [batch, vocab_size] mostly-zero vector
```
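A sketch of how the output can be inspected: encode a query and read off the highest-weighted vocabulary terms, which for a trained SPLADE model typically include expansion terms not present in the input (an untrained base checkpoint will produce noisy weights).

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = SPLADE()

batch = tokenizer(["affordable laptops"], return_tensors='pt')
with torch.no_grad():
    vec = model(batch['input_ids'], batch['attention_mask'])[0]

top = torch.topk(vec, k=10)
print(tokenizer.convert_ids_to_tokens(top.indices.tolist()))
```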
4. Dense-First, Sparse-Refine
```python
def cascade_retrieval(query, sparse_index, dense_index):
    """Two-stage retrieval pipeline.

    Assumes dense results expose 'id' and 'score' keys and the sparse
    index can score an arbitrary (query, document) pair.
    """
    # Stage 1: dense retrieval for broad semantic recall
    candidates = dense_index.search(query, k=1000)
    candidate_ids = [c['id'] for c in candidates]

    # Stage 2: sparse re-scoring for lexical precision
    sparse_scores = []
    for doc_id in candidate_ids:
        score = sparse_index.score_document(query, doc_id)
        sparse_scores.append((doc_id, score))

    # Combine with the original dense scores
    final_scores = []
    for i, (doc_id, sparse_score) in enumerate(sparse_scores):
        dense_score = candidates[i]['score']
        combined = 0.3 * dense_score + 0.7 * sparse_score
        final_scores.append((doc_id, combined))

    return sorted(final_scores, key=lambda x: x[1], reverse=True)[:10]
```
Choosing the Right Approach
Decision Matrix
```python
def choose_retrieval_method(requirements):
    """Recommend a retrieval approach based on requirements."""
    if requirements['exact_matching_critical']:
        if requirements['semantic_understanding_needed']:
            return 'hybrid'
        else:
            return 'sparse'

    if requirements['vocabulary_mismatch_common']:
        return 'dense'

    if requirements['latency_critical'] and requirements['latency_budget_ms'] < 20:
        return 'sparse'

    if requirements['index_size_constraint']:
        return 'sparse'

    if requirements['multilingual']:
        return 'dense'

    # Default to hybrid for best quality
    return 'hybrid'
```
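For example, a product that needs exact identifiers but also benefits from semantic matching:

```python
requirements = {
    'exact_matching_critical': True,
    'semantic_understanding_needed': True,
    'vocabulary_mismatch_common': False,
    'latency_critical': False,
    'latency_budget_ms': 100,
    'index_size_constraint': False,
    'multilingual': False,
}
print(choose_retrieval_method(requirements))  # 'hybrid'
```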
Use Cases
Sparse is Better For:
- Legal document search (exact terms matter)
- Code search (specific function names)
- E-commerce (product IDs, SKUs)
- Low-latency requirements (<20ms)
Dense is Better For:
- Question answering
- Semantic similarity
- Cross-lingual search
- Conversational search
Hybrid is Better For:
- Enterprise search
- Academic search
- Medical information retrieval
- General web search
Implementation Best Practices
1. Index Optimization
```python
import faiss
from rank_bm25 import BM25Okapi

# Sparse optimization: tune BM25's free parameters
sparse_index = BM25Okapi(
    tokenized_docs,
    k1=1.2,        # term-frequency saturation
    b=0.75,        # document-length normalization
    epsilon=0.25   # IDF floor for very common terms
)

# Dense optimization: inverted file with product quantization
dense_index = faiss.index_factory(
    768,                         # embedding dimension
    "IVF4096,PQ64",              # 4096 clusters, 64-byte PQ codes
    faiss.METRIC_INNER_PRODUCT
)
```
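One caveat with the IVF index above: unlike `IndexFlatIP`, it must be trained on a representative sample of vectors before anything can be added. A sketch, assuming `embeddings` is a float32 array of shape `[n, 768]`:

```python
assert not dense_index.is_trained
dense_index.train(embeddings)   # learns the 4096 coarse centroids and PQ codebooks
dense_index.add(embeddings)

# Clusters visited per query: the recall/latency trade-off knob
faiss.extract_index_ivf(dense_index).nprobe = 16
```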
2. Query Processing
```python
def process_query(query, mode='hybrid'):
    """Query preprocessing pipeline."""
    # Common preprocessing
    query = query.lower().strip()
    query_sparse = query_dense = None  # stays None for the unused branch

    if mode in ['sparse', 'hybrid']:
        # Sparse-specific: strip stopwords, stem remaining tokens
        query_sparse = remove_stopwords(query)
        query_sparse = stem_tokens(query_sparse)

    if mode in ['dense', 'hybrid']:
        # Dense-specific: expand abbreviations, respect model input limit
        query_dense = expand_abbreviations(query)
        query_dense = query_dense[:512]  # truncate for BERT-style encoders

    return query_sparse, query_dense
```
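The helpers above are left undefined; minimal versions, assuming NLTK is installed with its `stopwords` corpus downloaded (the abbreviation table in `expand_abbreviations` is purely illustrative):

```python
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords', quiet=True)
STOPWORDS = set(stopwords.words('english'))
STEMMER = PorterStemmer()

def remove_stopwords(text):
    return ' '.join(w for w in text.split() if w not in STOPWORDS)

def stem_tokens(text):
    return ' '.join(STEMMER.stem(w) for w in text.split())

def expand_abbreviations(text, table={'ml': 'machine learning', 'ir': 'information retrieval'}):
    # Hypothetical lookup table; real systems maintain domain-specific ones
    return ' '.join(table.get(w, w) for w in text.split())
```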
3. Evaluation Metrics
```python
import numpy as np

def evaluate_retrieval(system, test_queries, relevance_judgments):
    """Comprehensive retrieval evaluation."""
    metrics = {
        'mrr': [],       # Mean Reciprocal Rank
        'map': [],       # Mean Average Precision
        'ndcg': [],      # Normalized Discounted Cumulative Gain
        'recall@k': {1: [], 5: [], 10: [], 100: []}
    }

    for query, relevant in zip(test_queries, relevance_judgments):
        results = system.search(query, k=100)

        # Compute metrics
        metrics['mrr'].append(compute_mrr(results, relevant))
        metrics['map'].append(compute_map(results, relevant))
        metrics['ndcg'].append(compute_ndcg(results, relevant))

        for k in [1, 5, 10, 100]:
            recall = compute_recall_at_k(results[:k], relevant)
            metrics['recall@k'][k].append(recall)

    # Average metrics
    return {
        'MRR': np.mean(metrics['mrr']),
        'MAP': np.mean(metrics['map']),
        'NDCG@10': np.mean(metrics['ndcg']),
        'Recall@10': np.mean(metrics['recall@k'][10])
    }
```
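As one example of the metric helpers, a minimal `compute_mrr`, assuming `results` is an ordered list of document IDs and `relevant` is a set of relevant IDs:

```python
def compute_mrr(results, relevant):
    """Reciprocal rank of the first relevant document (0 if none retrieved)."""
    for rank, doc_id in enumerate(results, 1):
        if doc_id in relevant:
            return 1.0 / rank
    return 0.0
```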
Related Concepts
- Dense Embeddings - Deep dive into neural embeddings
- Multi-Vector Late Interaction - Advanced dense retrieval
- Quantization Effects - Optimizing embedding storage
References
- Robertson, S. & Zaragoza, H. (2009). "The Probabilistic Relevance Framework: BM25 and Beyond"
- Reimers, N. & Gurevych, I. (2019). "Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks"
- Formal, T. et al. (2021). "SPLADE: Sparse Lexical and Expansion Model for First Stage Ranking"
- Ma, X. et al. (2021). "A Replication Study of Dense Passage Retriever"
- Khattab, O. & Zaharia, M. (2020). "ColBERT: Efficient and Effective Passage Search via Contextualized Late Interaction over BERT"
