Qdrant Hybrid Search¶
Qdrant supports hybrid search by combining search results from `sparse` and `dense` vectors.
`dense` vectors are the ones you are likely already using: embedding models from OpenAI, BGE, SentenceTransformers, and others are typically `dense` embedding models. They create a numerical representation of a piece of text, expressed as a long list of numbers. These `dense` vectors can capture rich semantics across an entire piece of text.
`sparse` vectors are slightly different. They use specialized approaches or models (TF-IDF, BM25, SPLADE, etc.) to generate vectors. These vectors are typically mostly zeros, which is why they are called `sparse`. `sparse` vectors are very good at capturing specific keywords and similar small details.
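To make the difference concrete, here is a tiny illustrative sketch (the values are invented for demonstration, not produced by any real model) of how the two representations are commonly stored:
# illustrative only -- made-up values, not from a real embedding model
# a dense vector assigns a value to every dimension (e.g. 1536 dims for OpenAI embeddings)
dense_vector = [0.12, -0.03, 0.57, 0.44, -0.21, 0.08]
# a sparse vector is mostly zeros, so it is stored as parallel lists of
# non-zero indices (often vocabulary token ids) and their weights
sparse_indices = [17, 1024, 30522]
sparse_values = [1.9, 0.4, 2.3]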
This notebook walks through how to set up and customize hybrid search with Qdrant and the `"prithvida/Splade_PP_en_v1"` variants from Huggingface.
Setup¶
First, we set up our environment and load our data.
%pip install -U llama-index llama-index-vector-stores-qdrant fastembed
import os
os.environ["OPENAI_API_KEY"] = "sk-..."
!mkdir -p 'data/'
!wget --user-agent "Mozilla" "https://arxiv.org/pdf/2307.09288.pdf" -O "data/llama2.pdf"
from llama_index.core import SimpleDirectoryReader
documents = SimpleDirectoryReader("./data/").load_data()
Indexing Data¶
Now, we can index our data.
Hybrid search with Qdrant must be enabled from the beginning: we simply set `enable_hybrid=True`.
In addition to using OpenAI for dense vectors, this will also run sparse vector generation locally with fastembed; the code below uses the `"Qdrant/bm25"` sparse model.
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.core import Settings
from llama_index.vector_stores.qdrant import QdrantVectorStore
from qdrant_client import QdrantClient, AsyncQdrantClient
# connect to a running Qdrant instance with both sync and async clients
client = QdrantClient(host="localhost", port=6333)
aclient = AsyncQdrantClient(host="localhost", port=6333)
# create our vector store with hybrid indexing enabled
# batch_size controls how many nodes are encoded with sparse vectors at once
vector_store = QdrantVectorStore(
"llama2_paper",
client=client,
aclient=aclient,
enable_hybrid=True,
fastembed_sparse_model="Qdrant/bm25",
batch_size=20,
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
Settings.chunk_size = 512
index = VectorStoreIndex.from_documents(
documents,
storage_context=storage_context,
)
query_engine = index.as_query_engine(
similarity_top_k=2, sparse_top_k=12, vector_store_query_mode="hybrid"
)
from IPython.display import display, Markdown
response = query_engine.query(
"How was Llama2 specifically trained differently from Llama1?"
)
display(Markdown(str(response)))
Llama 2 was trained differently from Llama 1, with changes including performing more robust data cleaning, updating the data mixes, training on 40% more total tokens, doubling the context length, and using grouped-query attention (GQA) to improve inference scalability for the larger models. Additionally, Llama 2 adopted most of the pretraining settings and model architecture from Llama 1, but included architectural enhancements such as the increased context length and grouped-query attention.
print(len(response.source_nodes))
2
Let's compare against not using hybrid search at all!
from IPython.display import display, Markdown
query_engine = index.as_query_engine(
similarity_top_k=2,
# sparse_top_k=10,
# vector_store_query_mode="hybrid"
)
response = query_engine.query(
"How was Llama2 specifically trained differently from Llama1?"
)
display(Markdown(str(response)))
Llama 2 was trained differently from Llama 1, with changes made to improve performance, such as performing more robust data cleaning, updating the data mixes, training on 40% more total tokens, doubling the context length, and using grouped-query attention (GQA) to improve inference scalability for the larger models.
Async Support¶
And of course, async queries are supported as well (note that in-memory Qdrant data is not shared between async and sync clients!)
import nest_asyncio
nest_asyncio.apply()
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.core import Settings
from llama_index.vector_stores.qdrant import QdrantVectorStore
# create our vector store with hybrid indexing enabled
vector_store = QdrantVectorStore(
collection_name="llama2_paper",
client=client,
aclient=aclient,
enable_hybrid=True,
fastembed_sparse_model="Qdrant/bm25",
batch_size=20,
)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
Settings.chunk_size = 512
index = VectorStoreIndex.from_documents(
documents,
storage_context=storage_context,
use_async=True,
)
query_engine = index.as_query_engine(
    similarity_top_k=2, sparse_top_k=10, vector_store_query_mode="hybrid"
)
response = await query_engine.aquery(
"What baseline models are measured against in the paper?"
)
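Custom Sparse Vector Generation¶
Instead of relying on fastembed, you can also supply your own functions for generating sparse vectors. As an example, below we load the naver/efficient-splade-VI-BT-large document and query SPLADE models from Huggingface and use them to compute sparse vectors as (indices, values) pairs, which are then passed to the vector store via `sparse_doc_fn` and `sparse_query_fn`.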
from typing import Any, List, Tuple
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
doc_tokenizer = AutoTokenizer.from_pretrained(
"naver/efficient-splade-VI-BT-large-doc"
)
doc_model = AutoModelForMaskedLM.from_pretrained(
"naver/efficient-splade-VI-BT-large-doc"
)
query_tokenizer = AutoTokenizer.from_pretrained(
"naver/efficient-splade-VI-BT-large-query"
)
query_model = AutoModelForMaskedLM.from_pretrained(
"naver/efficient-splade-VI-BT-large-query"
)
def sparse_doc_vectors(
texts: List[str],
) -> Tuple[List[List[int]], List[List[float]]]:
"""
Computes vectors from logits and attention mask using ReLU, log, and max operations.
"""
tokens = doc_tokenizer(
texts, truncation=True, padding=True, return_tensors="pt"
)
if torch.cuda.is_available():
tokens = tokens.to("cuda")
output = doc_model(**tokens)
logits, attention_mask = output.logits, tokens.attention_mask
relu_log = torch.log(1 + torch.relu(logits))
weighted_log = relu_log * attention_mask.unsqueeze(-1)
tvecs, _ = torch.max(weighted_log, dim=1)
# extract the vectors that are non-zero and their indices
indices = []
vecs = []
for batch in tvecs:
indices.append(batch.nonzero(as_tuple=True)[0].tolist())
vecs.append(batch[indices[-1]].tolist())
return indices, vecs
def sparse_query_vectors(
texts: List[str],
) -> Tuple[List[List[int]], List[List[float]]]:
"""
Computes vectors from logits and attention mask using ReLU, log, and max operations.
"""
# TODO: compute sparse vectors in batches if max length is exceeded
tokens = query_tokenizer(
texts, truncation=True, padding=True, return_tensors="pt"
)
if torch.cuda.is_available():
tokens = tokens.to("cuda")
output = query_model(**tokens)
logits, attention_mask = output.logits, tokens.attention_mask
relu_log = torch.log(1 + torch.relu(logits))
weighted_log = relu_log * attention_mask.unsqueeze(-1)
tvecs, _ = torch.max(weighted_log, dim=1)
# extract the vectors that are non-zero and their indices
indices = []
vecs = []
for batch in tvecs:
indices.append(batch.nonzero(as_tuple=True)[0].tolist())
vecs.append(batch[indices[-1]].tolist())
return indices, vecs
vector_store = QdrantVectorStore(
"llama2_paper",
client=client,
enable_hybrid=True,
sparse_doc_fn=sparse_doc_vectors,
sparse_query_fn=sparse_query_vectors,
)
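The vector store can then be used exactly as before; a quick sketch, reusing the documents loaded earlier:
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
)
query_engine = index.as_query_engine(
    similarity_top_k=2, sparse_top_k=12, vector_store_query_mode="hybrid"
)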
Customizing hybrid_fusion_fn()¶
By default, when running hybrid queries with Qdrant, Relative Score Fusion is used to combine the nodes retrieved from both the sparse and dense queries.
You can customize this function to be any other method (plain deduplication, reciprocal rank fusion, etc.; a reciprocal rank fusion sketch follows the default code below).
Below is the default code for our relative score fusion approach, as well as how to pass it into the constructor.
from llama_index.core.vector_stores import VectorStoreQueryResult
def relative_score_fusion(
dense_result: VectorStoreQueryResult,
sparse_result: VectorStoreQueryResult,
alpha: float = 0.5, # passed in from the query engine
top_k: int = 2, # passed in from the query engine i.e. similarity_top_k
) -> VectorStoreQueryResult:
"""
Fuse dense and sparse results using relative score fusion.
"""
# sanity check
assert dense_result.nodes is not None
assert dense_result.similarities is not None
assert sparse_result.nodes is not None
assert sparse_result.similarities is not None
# deconstruct results
sparse_result_tuples = list(
zip(sparse_result.similarities, sparse_result.nodes)
)
sparse_result_tuples.sort(key=lambda x: x[0], reverse=True)
dense_result_tuples = list(
zip(dense_result.similarities, dense_result.nodes)
)
dense_result_tuples.sort(key=lambda x: x[0], reverse=True)
# track nodes in both results
all_nodes_dict = {x.node_id: x for x in dense_result.nodes}
for node in sparse_result.nodes:
if node.node_id not in all_nodes_dict:
all_nodes_dict[node.node_id] = node
# normalize sparse similarities from 0 to 1
sparse_similarities = [x[0] for x in sparse_result_tuples]
max_sparse_sim = max(sparse_similarities)
min_sparse_sim = min(sparse_similarities)
sparse_similarities = [
(x - min_sparse_sim) / (max_sparse_sim - min_sparse_sim)
for x in sparse_similarities
]
sparse_per_node = {
sparse_result_tuples[i][1].node_id: x
for i, x in enumerate(sparse_similarities)
}
# normalize dense similarities from 0 to 1
dense_similarities = [x[0] for x in dense_result_tuples]
max_dense_sim = max(dense_similarities)
min_dense_sim = min(dense_similarities)
dense_similarities = [
(x - min_dense_sim) / (max_dense_sim - min_dense_sim)
for x in dense_similarities
]
dense_per_node = {
dense_result_tuples[i][1].node_id: x
for i, x in enumerate(dense_similarities)
}
# fuse the scores
fused_similarities = []
for node_id in all_nodes_dict:
sparse_sim = sparse_per_node.get(node_id, 0)
dense_sim = dense_per_node.get(node_id, 0)
fused_sim = alpha * (sparse_sim + dense_sim)
fused_similarities.append((fused_sim, all_nodes_dict[node_id]))
fused_similarities.sort(key=lambda x: x[0], reverse=True)
fused_similarities = fused_similarities[:top_k]
# create final response object
return VectorStoreQueryResult(
nodes=[x[1] for x in fused_similarities],
similarities=[x[0] for x in fused_similarities],
ids=[x[1].node_id for x in fused_similarities],
)
vector_store = QdrantVectorStore(
"llama2_paper",
client=client,
enable_hybrid=True,
hybrid_fusion_fn=relative_score_fusion,
)
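As a sketch of one alternative, reciprocal rank fusion scores each node by the reciprocal of its rank in each result list rather than by normalized similarity. The version below is illustrative only (it is not the library default), but it keeps the same signature as the function above:
def reciprocal_rank_fusion(
    dense_result: VectorStoreQueryResult,
    sparse_result: VectorStoreQueryResult,
    alpha: float = 0.5,  # passed in from the query engine, unused by this method
    top_k: int = 2,  # passed in from the query engine i.e. similarity_top_k
    k: int = 60,  # standard RRF smoothing constant
) -> VectorStoreQueryResult:
    """
    Fuse dense and sparse results using reciprocal rank fusion.
    """
    # collect every unique node from both result sets
    all_nodes_dict = {x.node_id: x for x in dense_result.nodes}
    for node in sparse_result.nodes:
        all_nodes_dict.setdefault(node.node_id, node)

    # each node accumulates 1 / (k + rank) for every result list it appears in
    rrf_scores = {node_id: 0.0 for node_id in all_nodes_dict}
    for result in (dense_result, sparse_result):
        ranked = sorted(
            zip(result.similarities, result.nodes),
            key=lambda x: x[0],
            reverse=True,
        )
        for rank, (_, node) in enumerate(ranked, start=1):
            rrf_scores[node.node_id] += 1.0 / (k + rank)

    # keep the top_k highest fused scores
    fused = sorted(rrf_scores.items(), key=lambda x: x[1], reverse=True)[:top_k]
    return VectorStoreQueryResult(
        nodes=[all_nodes_dict[node_id] for node_id, _ in fused],
        similarities=[score for _, score in fused],
        ids=[node_id for node_id, _ in fused],
    )
It can then be passed to the constructor as hybrid_fusion_fn=reciprocal_rank_fusion, just like above.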
You may have noticed the `alpha` parameter in the function above. This can be set directly in the `as_query_engine()` call, which will set it on the vector index retriever.
index.as_query_engine(alpha=0.5, similarity_top_k=2)
Customizing Hybrid Qdrant Collections¶
Instead of letting llama-index do it, you can also configure your hybrid Qdrant collections ahead of time.
NOTE: The names of the vector configs must be `text-dense` and `text-sparse` if creating a hybrid index.
from qdrant_client import models
client.recreate_collection(
collection_name="llama2_paper",
vectors_config={
"text-dense": models.VectorParams(
size=1536, # openai vector size
distance=models.Distance.COSINE,
)
},
sparse_vectors_config={
"text-sparse": models.SparseVectorParams(
index=models.SparseIndexParams()
)
},
)
# enable hybrid since we created a sparse collection
vector_store = QdrantVectorStore(
collection_name="llama2_paper", client=client, enable_hybrid=True
)
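From here, building the index on top of the pre-created collection follows the same pattern as earlier in this notebook; for example:
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
)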