Custom Embeddings
LlamaIndex supports embeddings from OpenAI, Azure, and Langchain. But if these aren't enough, you can also implement any embedding model!
The example below uses Instructor Embeddings (see here for install/setup details) and implements a custom embedding class. Instructor embeddings work by providing text, as well as "instructions" about the domain of the text being embedded. This is helpful when embedding text from a very specific and specialized topic.
If you're opening this notebook on Colab, you will probably need to install LlamaIndex 🦙.
In [ ]
!pip install llama-index
In [ ]
# Install dependencies
# !pip install InstructorEmbedding torch transformers sentence-transformers
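As a quick illustration of the instruction/text pairing described above, here is a minimal sketch of calling the INSTRUCTOR model directly. The medicine-flavored instruction and sentence are made-up examples, and the (1, 768) output shape assumes instructor-large:
from InstructorEmbedding import INSTRUCTOR

model = INSTRUCTOR("hkunlp/instructor-large")
# each input is an [instruction, text] pair -- the instruction steers the embedding
embeddings = model.encode(
    [["Represent the Medicine document for retrieval:", "Aspirin inhibits COX enzymes."]]
)
print(embeddings.shape)  # (1, 768) for instructor-large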
In [ ]
import openai
import os
os.environ["OPENAI_API_KEY"] = "YOUR_API_KEY"
openai.api_key = os.environ["OPENAI_API_KEY"]
Custom Embeddings Implementation
In [ ]
from typing import Any, List

from InstructorEmbedding import INSTRUCTOR
from llama_index.core.bridge.pydantic import PrivateAttr
from llama_index.core.embeddings import BaseEmbedding


class InstructorEmbeddings(BaseEmbedding):
    # private attributes, hidden from pydantic validation/serialization
    _model: INSTRUCTOR = PrivateAttr()
    _instruction: str = PrivateAttr()

    def __init__(
        self,
        instructor_model_name: str = "hkunlp/instructor-large",
        instruction: str = "Represent a document for semantic search:",
        **kwargs: Any,
    ) -> None:
        super().__init__(**kwargs)
        self._model = INSTRUCTOR(instructor_model_name)
        self._instruction = instruction

    @classmethod
    def class_name(cls) -> str:
        return "instructor"

    # async variants simply delegate to the sync implementations
    async def _aget_query_embedding(self, query: str) -> List[float]:
        return self._get_query_embedding(query)

    async def _aget_text_embedding(self, text: str) -> List[float]:
        return self._get_text_embedding(text)

    def _get_query_embedding(self, query: str) -> List[float]:
        embeddings = self._model.encode([[self._instruction, query]])
        return embeddings[0]

    def _get_text_embedding(self, text: str) -> List[float]:
        embeddings = self._model.encode([[self._instruction, text]])
        return embeddings[0]

    # batched version, used when embedding many chunks at once
    def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:
        embeddings = self._model.encode(
            [[self._instruction, text] for text in texts]
        )
        return embeddings
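Before wiring the class into an index, it can be sanity-checked on its own; get_query_embedding is the public entry point on BaseEmbedding that dispatches to the private _get_query_embedding hook implemented above. A minimal sketch (the 768 length assumes instructor-large):
embed_model = InstructorEmbeddings()
query_embedding = embed_model.get_query_embedding("What is a custom embedding model?")
print(len(query_embedding))  # 768 for instructor-large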
Usage Example
In [ ]
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.core import Settings
Download Data
In [ ]
!mkdir -p 'data/paul_graham/'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'
Load Documents
In [ ]
documents = SimpleDirectoryReader("./data/paul_graham/").load_data()
In [ ]
embed_model = InstructorEmbeddings(embed_batch_size=2)
Settings.embed_model = embed_model
Settings.chunk_size = 512
# if running for the first time, will download model weights first!
index = VectorStoreIndex.from_documents(documents)
load INSTRUCTOR_Transformer max_seq_length 512
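Since embedding every chunk takes time, the built index can be persisted and reloaded instead of re-embedding on every run. A minimal sketch using LlamaIndex's standard storage APIs (the ./storage path is an arbitrary choice; Settings.embed_model must still point at InstructorEmbeddings when reloading):
# persist the freshly built index to disk
index.storage_context.persist(persist_dir="./storage")

# ...later, reload it without re-embedding
from llama_index.core import StorageContext, load_index_from_storage

storage_context = StorageContext.from_defaults(persist_dir="./storage")
index = load_index_from_storage(storage_context)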
In [ ]
response = index.as_query_engine().query("What did the author do growing up?")
print(response)
response = index.as_query_engine().query("作者成长过程中做了什么?") print(response)
The author wrote short stories and also worked on programming, specifically on an IBM 1401 computer in 9th grade. They used an early version of Fortran and had to type programs on punch cards. Later on, they got a microcomputer, a TRS-80, and started programming more extensively, writing simple games and a word processor. They initially planned to study philosophy in college but eventually switched to AI.