WordLift Vector Store
Introduction
This script demonstrates how to crawl a product website, extract relevant information, build an SEO-friendly knowledge graph (a structured representation of the PDPs and PLPs), and leverage it to improve search and user experience.
Key features and libraries:
- Web scraping (Advertools)
- Knowledge graph creation for Product Detail Pages (PDPs) and Product Listing Pages (PLPs) (WordLift)
- Product recommendations (WordLift Neural Search)
- Shopping assistant creation (WordLift + LlamaIndex 🦙)
This approach enhances SEO performance and user engagement for e-commerce sites.
Learn more about how it works here:
- https://www.youtube.com/watch?v=CH-ir1MTAwQ
- https://wordlift.io/academy-entries/mastering-serp-analysis-knowledge-graphs
Authors: Andrea Volpini and David Riccitelli. MIT License. Last updated: July 31, 2024.
Setup
!pip install advertools -q
!pip install -U wordlift-client # 🎉 first time on stage 🎉
!pip install rdflib -q
# Standard library imports
import json
import logging
import os
import re
import urllib.parse
import requests
from typing import List, Optional
# Third-party imports
import advertools as adv
import pandas as pd
import nest_asyncio
# RDFLib imports
from rdflib import Graph, Literal, RDF, URIRef
from rdflib.namespace import SDO, Namespace, DefinedNamespace
# WordLift client imports
import wordlift_client
from wordlift_client import Configuration, ApiClient
from wordlift_client.rest import ApiException
from wordlift_client.api.dataset_api import DatasetApi
from wordlift_client.api.entities_api import EntitiesApi
from wordlift_client.api.graph_ql_api import GraphQLApi
from wordlift_client.models.graphql_request import GraphqlRequest
from wordlift_client.models.page_vector_search_query_response_item import (
PageVectorSearchQueryResponseItem,
)
from wordlift_client.models.vector_search_query_request import (
VectorSearchQueryRequest,
)
from wordlift_client.api.vector_search_queries_api import (
VectorSearchQueriesApi,
)
# Asynchronous programming
import asyncio
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Apply nest_asyncio
nest_asyncio.apply()
WORDLIFT_KEY = os.getenv("WORDLIFT_KEY")
OPENAI_KEY = os.getenv("OPENAI_KEY")
Crawling the Website with Advertools
# Step 1: Define the website structure
# -----------------------------------
# We're working with two types of pages:
# 1. Product Listing Pages (PLP): https://product-finder.wordlift.io/product-category/bags/
# 2. Product Detail Pages (PDP): https://product-finder.wordlift.io/product/1980s-marco-polo-crossbody-bag-in-black/
# The product description can be found at this XPath:
# /html/body/div[1]/div/div/div/div/div[1]/div/div[3]/div/div[2]/div[2]/div[1]/p/text()
# The price is here:
# /html/body/div[1]/div/div/div/div/div[1]/div/div[3]/div/div[2]/p/span/bdi/text()
# The category is here:
# //span[contains(@class, 'breadcrumb')]/a/text()
# Step 2: Set up the crawl
# ------------------------
def crawl_website(url, output_file, num_pages=10):
logger.info(f"Starting crawl of {url}")
adv.crawl(
url,
output_file,
follow_links=True,
custom_settings={
"CLOSESPIDER_PAGECOUNT": num_pages,
"USER_AGENT": "WordLiftBot/1.0 (Maven Project)",
"CONCURRENT_REQUESTS_PER_DOMAIN": 2,
"DOWNLOAD_DELAY": 1,
"ROBOTSTXT_OBEY": False,
},
xpath_selectors={
"product_description": "/html/body/div[1]/div/div/div/div/div[1]/div/div[3]/div/div[2]/div[2]/div[1]/p/text()",
"product_price": "/html/body/div[1]/div/div/div/div/div[1]/div/div[3]/div/div[2]/p/span/bdi/text()",
"product_category": "//span[@class='posted_in']/a/text()",
},
)
logger.info(f"Crawl completed. Results saved to {output_file}")
# Step 3: Analyze URL patterns
# ----------------------------
def analyze_url_patterns(df):
df["page_type"] = df["url"].apply(
lambda x: "PLP"
if "/product-category/" in x
else ("PDP" if "/product/" in x else "Other")
)
logger.info(
f"Found {(df['page_type'] == 'PLP').sum()} PLPs and {(df['page_type'] == 'PDP').sum()} PDPs"
)
return df
# Step 4: Extract page data
# ----------------------------
def extract_page_data(df):
extracted_data = []
for _, row in df.iterrows():
page = {
"url": row["url"],
"title": row["title"],
"page_type": row["page_type"],
"meta_description": row.get("meta_description", ""),
"og_title": row.get("og_title", ""),
"og_description": row.get("og_description", ""),
"h1": ", ".join(row.get("h1", []))
if isinstance(row.get("h1"), list)
else row.get("h1", ""),
"h2": ", ".join(row.get("h2", []))
if isinstance(row.get("h2"), list)
else row.get("h2", ""),
}
if row["page_type"] == "PDP":
page.update(
{
"product_description": ", ".join(
row.get("product_description", [])
)
if isinstance(row.get("product_description"), list)
else row.get("product_description", ""),
"product_price": ", ".join(row.get("product_price", []))
if isinstance(row.get("product_price"), list)
else row.get("product_price", ""),
"product_category": ", ".join(
row.get("product_category", [])
)
if isinstance(row.get("product_category"), list)
else row.get("product_category", ""),
}
)
elif row["page_type"] == "PLP":
# Parse the category from the H1 content
h1_content = (
row.get("h1", [""])[0]
if isinstance(row.get("h1"), list)
else row.get("h1", "")
)
category = (
h1_content.split("@@")[-1]
if "@@" in h1_content
else h1_content.replace("Category: ", "").strip()
)
page["category_name"] = category
extracted_data.append(page)
return pd.DataFrame(extracted_data)
Building the Knowledge Graph with WordLift 🕸
# Step 5: Configure the WordLift client
# ----------------------------
# Create a configuration object for the WordLift API client using your WordLift key.
configuration = Configuration(host="https://api.wordlift.io")
configuration.api_key["ApiKey"] = WORDLIFT_KEY
configuration.api_key_prefix["ApiKey"] = "Key"
EXAMPLE_PRIVATE_NS = Namespace("https://ns.example.org/private/")
BASE_URI = "http://data.wordlift.io/[dataset_id]/"
# Step 6: Build the KG and the embeddings
# ----------------------------
async def cleanup_knowledge_graph(api_client):
dataset_api = wordlift_client.DatasetApi(api_client)
try:
# Delete all
await dataset_api.delete_all_entities()
except Exception as e:
print(
"Exception when calling DatasetApi->delete_all_entities: %s\n" % e
)
async def create_entity(entities_api, entity_data):
g = Graph().parse(data=json.dumps(entity_data), format="json-ld")
body = g.serialize(format="application/rdf+xml")
await entities_api.create_or_update_entities(
body=body, _content_type="application/rdf+xml"
)
def replace_url(original_url: str) -> str:
old_domain = "https://product-finder.wordlift.io/"
new_domain = "https://data-science-with-python-for-seo.wordlift.dev/"
if original_url.startswith(old_domain):
return original_url.replace(old_domain, new_domain, 1)
else:
return original_url
def create_entity_uri(url):
parsed_url = urllib.parse.urlparse(url)
path = parsed_url.path.strip("/")
path_parts = path.split("/")
fragment = parsed_url.fragment
if "product" in path_parts:
# It's a product page or product offer
product_id = path_parts[-1] # Get the last part of the path
if fragment == "offer":
return f"{BASE_URI}offer_{product_id}"
else:
return f"{BASE_URI}product_{product_id}"
elif "product-category" in path_parts:
# It's a product listing page (PLP)
category = path_parts[-1] # Get the last part of the path
return f"{BASE_URI}plp_{category}"
else:
# For any other type of page
safe_path = "".join(c if c.isalnum() else "_" for c in path)
if fragment == "offer":
return f"{BASE_URI}offer_{safe_path}"
else:
return f"{BASE_URI}page_{safe_path}"
def clean_price(price_str):
if not price_str or price_str == "N/A":
return None
if isinstance(price_str, (int, float)):
return float(price_str)
try:
# Remove any non-numeric characters except for the decimal point
cleaned_price = "".join(
char for char in str(price_str) if char.isdigit() or char == "."
)
return float(cleaned_price)
except ValueError:
logger.warning(f"Could not convert price: {price_str}")
return None
def create_product_entity(row, dataset_uri):
url = replace_url(row["url"])
product_entity_uri = create_entity_uri(url)
entity_data = {
"@context": "http://schema.org",
"@type": "Product",
"@id": product_entity_uri,
"url": url,
"name": row["title"]
if not pd.isna(row["title"])
else "Untitled Product",
"urn:meta:requestEmbeddings": [
"http://schema.org/name",
"http://schema.org/description",
],
}
if not pd.isna(row.get("product_description")):
entity_data["description"] = row["product_description"]
if not pd.isna(row.get("product_price")):
price = clean_price(row["product_price"])
if price is not None:
# Create offer ID as a sub-resource of the product ID
offer_entity_uri = f"{product_entity_uri}/offer_1"
entity_data["offers"] = {
"@type": "Offer",
"@id": offer_entity_uri,
"price": str(price),
"priceCurrency": "GBP",
"availability": "http://schema.org/InStock",
"url": url,
}
if not pd.isna(row.get("product_category")):
entity_data["category"] = row["product_category"]
custom_attributes = {
key: row[key]
for key in [
"meta_description",
"og_title",
"og_description",
"h1",
"h2",
]
if not pd.isna(row.get(key))
}
if custom_attributes:
entity_data[str(EXAMPLE_PRIVATE_NS.attributes)] = json.dumps(
custom_attributes
)
return entity_data
def create_collection_entity(row, dataset_uri):
url = replace_url(row["url"])
entity_uri = create_entity_uri(url)
entity_data = {
"@context": "http://schema.org",
"@type": "CollectionPage",
"@id": entity_uri,
"url": url,
"name": row["category_name"] or row["title"],
}
custom_attributes = {
key: row[key]
for key in [
"meta_description",
"og_title",
"og_description",
"h1",
"h2",
]
if row.get(key)
}
if custom_attributes:
entity_data[str(EXAMPLE_PRIVATE_NS.attributes)] = json.dumps(
custom_attributes
)
return entity_data
async def build_knowledge_graph(df, dataset_uri, api_client):
entities_api = EntitiesApi(api_client)
for _, row in df.iterrows():
try:
if row["page_type"] == "PDP":
entity_data = create_product_entity(row, dataset_uri)
elif row["page_type"] == "PLP":
entity_data = create_collection_entity(row, dataset_uri)
else:
logger.warning(
f"Skipping unknown page type for URL: {row['url']}"
)
continue
if entity_data is None:
logger.warning(
f"Skipping page due to missing critical data: {row['url']}"
)
continue
await create_entity(entities_api, entity_data)
logger.info(
f"Created entity for {row['page_type']}: {row['title']}"
)
except Exception as e:
logger.error(
f"Error creating entity for {row['page_type']}: {row['title']}"
)
logger.error(f"Error: {str(e)}")
Running the Demo
# ----------------------------
# Main Execution
# ----------------------------
# Global configuration variables
CRAWL_URL = "https://product-finder.wordlift.io/"
OUTPUT_FILE = "crawl_results.jl"
async def main():
# Step 1: Crawl the website
crawl_website(CRAWL_URL, OUTPUT_FILE)
# Step 2: Load the crawled data
df = pd.read_json(OUTPUT_FILE, lines=True)
# Step 3: Analyze URL patterns
df = analyze_url_patterns(df)
# Step 4: Extract page data
pages_df = extract_page_data(df)
async with ApiClient(configuration) as api_client:
# Clean up the existing knowledge graph
try:
await cleanup_knowledge_graph(api_client)
logger.info(f"Knowledge Graph Cleaned Up")
except Exception as e:
logger.error(
f"Failed to clean up the existing Knowledge Graph: {str(e)}"
)
return # Exit if cleanup fails
# Build the new knowledge graph
await build_knowledge_graph(pages_df, CRAWL_URL, api_client)
logger.info("Knowledge graph building completed.")
if __name__ == "__main__":
asyncio.run(main())
Querying the Products in the Knowledge Graph with GraphQL
async def perform_graphql_query(api_client):
graphql_api = GraphQLApi(api_client)
query = """
{
products(rows: 20) {
id: iri
category: string(name:"schema:category")
name: string(name:"schema:name")
description: string(name:"schema:description")
url: string(name:"schema:url")
}
}
"""
request = GraphqlRequest(query=query)
try:
response = await graphql_api.graphql_using_post(body=request)
print("GraphQL Query Results:")
print(json.dumps(response, indent=2))
except Exception as e:
logger.error(f"An error occurred during GraphQL query: {e}")
async with ApiClient(configuration) as api_client:
# Step 6: Perform GraphQL query
await perform_graphql_query(api_client)
logger.info("Knowledge graph building and GraphQL query completed.")
Leveraging the Knowledge Graph
Now that we have successfully created a knowledge graph for our e-commerce website, complete with product embeddings, we can leverage it to enhance the user experience and site functionality. The embeddings we generated for each product allow us to perform semantic similarity searches and build smarter systems.
Adding Structured Data to Web Pages
In this section we run a quick test of WordLift's Data API. This API is used to inject structured data markup from the knowledge graph (KG) into your web pages. Structured data helps search engines better understand your content, potentially earning rich results in search and improving SEO.
For this notebook we use a pre-configured KG on a demo e-commerce website. We'll reference a fictional URL: https://data-science-with-python-for-seo.wordlift.dev.
When calling WordLift's Data API, we simply pass a URL and receive the corresponding JSON-LD (JavaScript Object Notation for Linked Data). For an e-commerce site, this structured data typically includes information such as product details, prices, and availability.
The get_json_ld_from_url() function below demonstrates this process. It takes a URL as input and returns the structured data in JSON-LD format, ready to be injected into your web page.
def get_json_ld_from_url(url):
# Construct the API URL by prefixing with 'https://api.wordlift.io/data/https/'
api_url = "https://api.wordlift.io/data/https/" + url.replace(
"https://", ""
)
# Make the GET request to the API
response = requests.get(api_url)
# Check if the request was successful
if response.status_code == 200:
# Parse the JSON-LD from the response
json_ld = response.json()
return json_ld
else:
print(f"Failed to retrieve data: {response.status_code}")
return None
def pretty_print_json(json_obj):
# Pretty print the JSON object
print(json.dumps(json_obj, indent=4))
# Let's run a test
url = "https://data-science-with-python-for-seo.wordlift.dev/product/100-pure-deluxe-travel-pack-duo-2/"
json_ld = get_json_ld_from_url(url)
json_ld
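The injection step itself happens outside the Data API. As a minimal sketch (the helper below is hypothetical and not part of the WordLift client), the returned JSON-LD can be wrapped in a script tag and placed in the page's <head>:
# Hypothetical helper: wrap the JSON-LD returned by the Data API in a
# <script type="application/ld+json"> tag ready to embed in a page's <head>.
def to_json_ld_script_tag(json_ld_obj):
    if json_ld_obj is None:
        return ""
    return (
        '<script type="application/ld+json">'
        + json.dumps(json_ld_obj)
        + "</script>"
    )
# Preview the first part of the generated markup
print(to_json_ld_script_tag(json_ld)[:300])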
Generating Similar Product Links with WordLift Neural Search
With our product embeddings in place, we can now leverage WordLift's Neural Search to recommend similar products to users. This feature significantly enhances user engagement and can increase sales by surfacing relevant products based on semantic similarity.
Unlike traditional keyword matching, semantic similarity takes into account the context and meaning of product descriptions. This allows for more nuanced and accurate recommendations, even when products don't share exact keywords.
The get_top_k_similar_urls function below implements this functionality. It takes a product URL as input and returns a list of semantically similar products, ranked by similarity score.
For example, if a user is viewing a red cotton t-shirt, the function might recommend cotton t-shirts in other colors, or similarly styled tops in different fabrics. This creates a more intuitive and engaging shopping experience.
By implementing this neural search capability, we create a more personalized and efficient shopping experience, potentially increasing user satisfaction and conversion rates.
async def get_top_k_similar_urls(configuration, query_url: str, top_k: int):
request = VectorSearchQueryRequest(
query_url=query_url,
similarity_top_k=top_k,
)
async with wordlift_client.ApiClient(configuration) as api_client:
api_instance = VectorSearchQueriesApi(api_client)
try:
page = await api_instance.create_query(
vector_search_query_request=request
)
return [
{
"url": item.id,
"name": item.text.split("\n")[0],
"score": item.score,
}
for item in page.items
if item.id and item.text
]
except Exception as e:
logger.error(f"Error querying for entities: {e}", exc_info=True)
return None
top_k = 10
url = "https://data-science-with-python-for-seo.wordlift.dev/product/100-mineral-sunscreen-spf-30/"
similar_urls = await get_top_k_similar_urls(
configuration, query_url=url, top_k=top_k
)
print(json.dumps(similar_urls, indent=2))
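To turn these results into "similar product" links for a product page template, a small formatting helper is enough. The sketch below is illustrative rather than part of WordLift's API; note that the url field holds the entity identifier returned by the neural search.
# Hypothetical helper: render the neural search results as an HTML list of links.
def render_similar_product_links(similar_products):
    if not similar_products:
        return ""
    items = [
        f'<li><a href="{item["url"]}">{item["name"]}</a></li>'
        for item in similar_products
    ]
    return '<ul class="similar-products">\n' + "\n".join(items) + "\n</ul>"
print(render_similar_product_links(similar_urls))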
Building a Chatbot for the E-commerce Website with LlamaIndex 🦙
The knowledge graph we've created is the perfect foundation for building an intelligent chatbot. LlamaIndex (formerly GPT Index) is a powerful data framework that lets us ingest, structure, and access private or domain-specific data with large language models (LLMs). With LlamaIndex, we can create a context-aware chatbot that understands our product catalog and can assist customers effectively.
By combining LlamaIndex with our knowledge graph, we can develop a chatbot that responds to direct queries. The chatbot will be aware of the product catalog, enabling it to:
- Answer questions about product specifications, availability, and pricing
- Make personalized product recommendations based on customer preferences
- Provide comparisons between similar products
This approach enables more natural and helpful interactions with customers, enhancing their shopping experience. The chatbot leverages the structured data in our knowledge graph, using LlamaIndex to efficiently retrieve and present relevant information through the LLM.
In the following sections, we'll walk through setting up LlamaIndex with our knowledge graph data and creating a chatbot that can intelligently assist e-commerce customers.
Install LlamaIndex and WordliftVectorStore 💪
%%capture
!pip install llama-index
!pip install -U 'git+https://github.com/wordlift/llama_index.git#egg=llama-index-vector-stores-wordlift&subdirectory=llama-index-integrations/vector_stores/llama-index-vector-stores-wordlift'
!pip install llama-index-embeddings-nomic
# import the necessary modules
from llama_index.vector_stores.wordlift import WordliftVectorStore
from llama_index.core import VectorStoreIndex
Setting Up NomicEmbeddings for the Query Engine
Nomic has released v1.5 🪆🪆🪆 of its embedding model, which brings significant improvements to text embeddings. Embeddings are numerical representations of text that capture semantic meaning, allowing our system to understand and compare the content of queries and documents.
Key features of Nomic v1.5 include:
- Variable-sized embeddings with dimensions between 64 and 768
- Matryoshka learning, which enables nested representations
- A context size of up to 8192 tokens
We use NomicEmbeddings in WordLift because of these advanced features, and we now also configure it in LlamaIndex for encoding user queries. Keeping the embedding model consistent across the stack ensures better alignment between our knowledge graph and the query-understanding process.
More information about NomicEmbeddings can be found here.
Head over here to get your free API key.
from llama_index.embeddings.nomic import NomicEmbedding
nomic_api_key = os.getenv("NOMIC_KEY")
embed_model = NomicEmbedding(
api_key=nomic_api_key,
dimensionality=128,
model_name="nomic-embed-text-v1.5",
)
embedding = embed_model.get_text_embedding("Hey Ho SEO!")
len(embedding)
We will use OpenAI as the default LLM for generating responses. Of course, any other available LLM could be used instead.
# Set the environment variable
os.environ["OPENAI_API_KEY"] = OPENAI_KEY
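If you prefer a different model, LlamaIndex lets you set the default LLM globally via its Settings object. A minimal sketch (the model name here is just an assumption; any supported LLM integration can be plugged in the same way):
# Optional: set a specific OpenAI model (or any other supported LLM)
# as the default used by the query engines created below.
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI

Settings.llm = OpenAI(model="gpt-4o-mini", temperature=0.1)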
Now set up WordliftVectorStore using the data from our knowledge graph.
# Let's configure WordliftVectorStore using our WL Key
vector_store = WordliftVectorStore(key=WORDLIFT_KEY)
# Create an index from the vector store
index = VectorStoreIndex.from_vector_store(
vector_store, embed_model=embed_model
)
# Create a query engine
query_engine = index.as_query_engine()
query1 = "Can you give me a product similar to the facial puff? Please add the URL also"
result1 = query_engine.query(query1)
print(result1)
# Function to handle queries
def run_query(query):
# Create an index from the vector store
index = VectorStoreIndex.from_vector_store(
vector_store, embed_model=embed_model
)
# Create a query engine
query_engine = index.as_query_engine()
response = query_engine.query(query)
return response
# Interactive query loop
while True:
user_query = input("Enter your query (or 'quit' to exit): ")
if user_query.lower() == "quit":
break
    result = run_query(user_query)
print(result)
print("\n---\n")