From 803b386a7a20f5bbcf87ff2d54bd951a887f33c3 Mon Sep 17 00:00:00 2001 From: Dobiichi-Origami <56953648+Dobiichi-Origami@users.noreply.github.com> Date: Fri, 19 Apr 2024 18:55:58 +0800 Subject: [PATCH] add vdb.ipynb (#468) --- cookbook/RAG/vdb.ipynb | 446 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 446 insertions(+) create mode 100644 cookbook/RAG/vdb.ipynb diff --git a/cookbook/RAG/vdb.ipynb b/cookbook/RAG/vdb.ipynb new file mode 100644 index 00000000..b956464c --- /dev/null +++ b/cookbook/RAG/vdb.ipynb @@ -0,0 +1,446 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 使用 VectorDB 结合 Langchain 进行 RAG\n", + "\n", + "百度向量数据库 VectorDB 是一款纯自研高性能、高性价比、生态丰富且即开即用的向量数据库服务。支持多种索引类型和相似度算法,百亿级向量规模,毫秒级查询延迟。百度向量数据库不仅能配合大模型打造专业知识库,还可以应用于图片搜索,音乐推荐,文本分类等领域\n", + "\n", + "在这篇教程中,我们会演示如何使用 Vector DB 搭配千帆 Python SDK,在 Langchain 中实现 RAG 功能\n", + "\n", + "# 准备工作\n", + "\n", + "首先,我们需要安装 Langchain, 千帆 Python SDK 以及 VectorDB 的相关 Pypi 依赖" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: langchain in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (0.1.9)\n", + "Collecting langchain\n", + " Downloading langchain-0.1.16-py3-none-any.whl.metadata (13 kB)\n", + "Requirement already satisfied: qianfan in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (0.3.5)\n", + "Collecting qianfan\n", + " Downloading qianfan-0.3.9-py3-none-any.whl.metadata (10 kB)\n", + "Collecting pymochow\n", + " Downloading pymochow-1.1.4-py3-none-any.whl.metadata (311 bytes)\n", + "Collecting pdfplumber\n", + " Using cached pdfplumber-0.11.0-py3-none-any.whl.metadata (39 kB)\n", + "Requirement already satisfied: PyYAML>=5.3 in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from langchain) (6.0.1)\n", + "Requirement already satisfied: SQLAlchemy<3,>=1.4 in 
/Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from langchain) (2.0.27)\n", + "Requirement already satisfied: aiohttp<4.0.0,>=3.8.3 in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from langchain) (3.8.6)\n", + "Requirement already satisfied: async-timeout<5.0.0,>=4.0.0 in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from langchain) (4.0.3)\n", + "Requirement already satisfied: dataclasses-json<0.7,>=0.5.7 in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from langchain) (0.6.4)\n", + "Requirement already satisfied: jsonpatch<2.0,>=1.33 in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from langchain) (1.33)\n", + "Collecting langchain-community<0.1,>=0.0.32 (from langchain)\n", + " Downloading langchain_community-0.0.33-py3-none-any.whl.metadata (8.5 kB)\n", + "Collecting langchain-core<0.2.0,>=0.1.42 (from langchain)\n", + " Downloading langchain_core-0.1.44-py3-none-any.whl.metadata (5.9 kB)\n", + "Collecting langchain-text-splitters<0.1,>=0.0.1 (from langchain)\n", + " Using cached langchain_text_splitters-0.0.1-py3-none-any.whl.metadata (2.0 kB)\n", + "Collecting langsmith<0.2.0,>=0.1.17 (from langchain)\n", + " Downloading langsmith-0.1.49-py3-none-any.whl.metadata (13 kB)\n", + "Requirement already satisfied: numpy<2,>=1 in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from langchain) (1.24.4)\n", + "Requirement already satisfied: pydantic<3,>=1 in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from langchain) (2.5.3)\n", + "Requirement already satisfied: requests<3,>=2 in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from langchain) (2.31.0)\n", + "Requirement already satisfied: tenacity<9.0.0,>=8.1.0 in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from langchain) (8.2.3)\n", + "Requirement already satisfied: aiolimiter>=1.1.0 in 
/Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from qianfan) (1.1.0)\n", + "Requirement already satisfied: bce-python-sdk>=0.8.79 in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from qianfan) (0.9.5)\n", + "Collecting diskcache<6.0.0,>=5.6.3 (from qianfan)\n", + " Using cached diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)\n", + "Requirement already satisfied: multiprocess in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from qianfan) (0.70.15)\n", + "Requirement already satisfied: prompt-toolkit>=3.0.38 in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from qianfan) (3.0.43)\n", + "Requirement already satisfied: python-dotenv>=1.0 in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from qianfan) (1.0.1)\n", + "Requirement already satisfied: rich>=13.0.0 in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from qianfan) (13.7.1)\n", + "Requirement already satisfied: typer>=0.9.0 in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from qianfan) (0.9.0)\n", + "Requirement already satisfied: typing-extensions>=4.0.0 in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from qianfan) (4.7.1)\n", + "Requirement already satisfied: orjson in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from pymochow) (3.9.15)\n", + "Requirement already satisfied: future in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from pymochow) (1.0.0)\n", + "Collecting pdfminer.six==20231228 (from pdfplumber)\n", + " Using cached pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)\n", + "Requirement already satisfied: Pillow>=9.1 in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from pdfplumber) (10.2.0)\n", + "Collecting pypdfium2>=4.18.0 (from pdfplumber)\n", + " Downloading pypdfium2-4.29.0-py3-none-macosx_11_0_arm64.whl.metadata (48 kB)\n", + "\u001b[2K 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m48.5/48.5 kB\u001b[0m \u001b[31m506.7 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: charset-normalizer>=2.0.0 in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from pdfminer.six==20231228->pdfplumber) (3.3.2)\n", + "Requirement already satisfied: cryptography>=36.0.0 in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from pdfminer.six==20231228->pdfplumber) (41.0.5)\n", + "Requirement already satisfied: attrs>=17.3.0 in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (23.2.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (6.0.5)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.9.4)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.3.3)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.3.1)\n", + "Requirement already satisfied: pycryptodome>=3.8.0 in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from bce-python-sdk>=0.8.79->qianfan) (3.20.0)\n", + "Requirement already satisfied: six>=1.4.0 in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from bce-python-sdk>=0.8.79->qianfan) (1.16.0)\n", + "Requirement already satisfied: marshmallow<4.0.0,>=3.18.0 in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from dataclasses-json<0.7,>=0.5.7->langchain) (3.20.2)\n", + "Requirement already satisfied: 
typing-inspect<1,>=0.4.0 in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from dataclasses-json<0.7,>=0.5.7->langchain) (0.9.0)\n", + "Requirement already satisfied: jsonpointer>=1.9 in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from jsonpatch<2.0,>=1.33->langchain) (2.4)\n", + "Collecting packaging<24.0,>=23.2 (from langchain-core<0.2.0,>=0.1.42->langchain)\n", + " Using cached packaging-23.2-py3-none-any.whl.metadata (3.2 kB)\n", + "Requirement already satisfied: wcwidth in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from prompt-toolkit>=3.0.38->qianfan) (0.2.13)\n", + "Requirement already satisfied: annotated-types>=0.4.0 in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from pydantic<3,>=1->langchain) (0.5.0)\n", + "Requirement already satisfied: pydantic-core==2.14.6 in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from pydantic<3,>=1->langchain) (2.14.6)\n", + "Requirement already satisfied: idna<4,>=2.5 in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from requests<3,>=2->langchain) (3.6)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from requests<3,>=2->langchain) (2.0.7)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from requests<3,>=2->langchain) (2024.2.2)\n", + "Requirement already satisfied: markdown-it-py>=2.2.0 in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from rich>=13.0.0->qianfan) (2.2.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from rich>=13.0.0->qianfan) (2.17.2)\n", + "Requirement already satisfied: click<9.0.0,>=7.1.1 in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from typer>=0.9.0->qianfan) (8.1.7)\n", + 
"Requirement already satisfied: dill>=0.3.7 in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from multiprocess->qianfan) (0.3.7)\n", + "Requirement already satisfied: cffi>=1.12 in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from cryptography>=36.0.0->pdfminer.six==20231228->pdfplumber) (1.16.0)\n", + "Requirement already satisfied: mdurl~=0.1 in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from markdown-it-py>=2.2.0->rich>=13.0.0->qianfan) (0.1.2)\n", + "Requirement already satisfied: mypy-extensions>=0.3.0 in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain) (1.0.0)\n", + "Requirement already satisfied: pycparser in /Users/pengyiyang/miniconda3/envs/py39/lib/python3.9/site-packages (from cffi>=1.12->cryptography>=36.0.0->pdfminer.six==20231228->pdfplumber) (2.21)\n", + "Downloading langchain-0.1.16-py3-none-any.whl (817 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m817.7/817.7 kB\u001b[0m \u001b[31m3.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", + "\u001b[?25hDownloading qianfan-0.3.9-py3-none-any.whl (370 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m370.4/370.4 kB\u001b[0m \u001b[31m6.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", + "\u001b[?25hDownloading pymochow-1.1.4-py3-none-any.whl (41 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m41.1/41.1 kB\u001b[0m \u001b[31m5.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hUsing cached pdfplumber-0.11.0-py3-none-any.whl (56 kB)\n", + "Using cached pdfminer.six-20231228-py3-none-any.whl (5.6 MB)\n", + "Using cached diskcache-5.6.3-py3-none-any.whl (45 kB)\n", + "Downloading langchain_community-0.0.33-py3-none-any.whl (1.9 MB)\n", + "\u001b[2K 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.9/1.9 MB\u001b[0m \u001b[31m5.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", + "\u001b[?25hDownloading langchain_core-0.1.44-py3-none-any.whl (290 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m290.2/290.2 kB\u001b[0m \u001b[31m7.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hUsing cached langchain_text_splitters-0.0.1-py3-none-any.whl (21 kB)\n", + "Downloading langsmith-0.1.49-py3-none-any.whl (115 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.2/115.2 kB\u001b[0m \u001b[31m6.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading pypdfium2-4.29.0-py3-none-macosx_11_0_arm64.whl (2.7 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.7/2.7 MB\u001b[0m \u001b[31m4.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", + "\u001b[?25hUsing cached packaging-23.2-py3-none-any.whl (53 kB)\n", + "Installing collected packages: pypdfium2, packaging, diskcache, pymochow, qianfan, pdfminer.six, langsmith, pdfplumber, langchain-core, langchain-text-splitters, langchain-community, langchain\n", + " Attempting uninstall: packaging\n", + " Found existing installation: packaging 24.0\n", + " Uninstalling packaging-24.0:\n", + " Successfully uninstalled packaging-24.0\n", + " Attempting uninstall: qianfan\n", + " Found existing installation: qianfan 0.3.5\n", + " Uninstalling qianfan-0.3.5:\n", + " Successfully uninstalled qianfan-0.3.5\n", + " Attempting uninstall: langsmith\n", + " Found existing installation: langsmith 0.1.8\n", + " Uninstalling langsmith-0.1.8:\n", + " Successfully uninstalled langsmith-0.1.8\n", + " Attempting uninstall: langchain-core\n", + " Found existing installation: langchain-core 0.1.26\n", + " Uninstalling langchain-core-0.1.26:\n", + " Successfully 
uninstalled langchain-core-0.1.26\n", + " Attempting uninstall: langchain-community\n", + " Found existing installation: langchain-community 0.0.24\n", + " Uninstalling langchain-community-0.0.24:\n", + " Successfully uninstalled langchain-community-0.0.24\n", + " Attempting uninstall: langchain\n", + " Found existing installation: langchain 0.1.9\n", + " Uninstalling langchain-0.1.9:\n", + " Successfully uninstalled langchain-0.1.9\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "datasets 2.17.1 requires huggingface-hub>=0.19.4, but you have huggingface-hub 0.16.4 which is incompatible.\n", + "opencompass 0.2.2 requires numpy==1.23.4, but you have numpy 1.24.4 which is incompatible.\n", + "opencompass 0.2.2 requires tqdm==4.64.1, but you have tqdm 4.66.2 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0mSuccessfully installed diskcache-5.6.3 langchain-0.1.16 langchain-community-0.0.33 langchain-core-0.1.44 langchain-text-splitters-0.0.1 langsmith-0.1.49 packaging-23.2 pdfminer.six-20231228 pdfplumber-0.11.0 pymochow-1.1.4 pypdfium2-4.29.0 qianfan-0.3.9\n" + ] + } + ], + "source": [ + "! 
pip install -U langchain qianfan pymochow beautifulsoup4"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "然后,我们还需要设置相关环境变量,以运行示例代码"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "from pymochow.auth.bce_credentials import BceCredentials\n",
+    "\n",
+    "# VectorDB connection settings: replace the placeholders with your own instance's values\n",
+    "account = 'root'\n",
+    "api_key = 'api_key'\n",
+    "endpoint = 'ip_address'  # NOTE: presumably the instance URL, e.g. http://HOST:PORT -- confirm in the console\n",
+    "\n",
+    "# Credentials object handed to the pymochow client configuration\n",
+    "credentials = BceCredentials(account, api_key)\n",
+    "\n",
+    "# Qianfan authentication (AK/SK) is supplied through environment variables\n",
+    "# Replace the two values below with your own Access Key and Secret Key\n",
+    "os.environ['QIANFAN_ACCESS_KEY'] = 'your_console_access_key'\n",
+    "os.environ['QIANFAN_SECRET_KEY'] = 'your_console_secret_key'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 创建数据库\n",
+    "\n",
+    "在我们设置完基础信息之后,我们需要在 Vector DB 中创建相对应的向量数据库"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pymochow\n",
+    "from pymochow.configuration import Configuration\n",
+    "\n",
+    "config_obj = Configuration(credentials=credentials, endpoint=endpoint)\n",
+    "client = pymochow.MochowClient(config_obj)\n",
+    "\n",
+    "database_name = \"document\"\n",
+    "\n",
+    "# Create the database only if it does not exist yet. This keeps the cell re-runnable and lets real\n",
+    "# failures (unreachable endpoint, bad credentials) raise instead of being printed and ignored\n",
+    "existing_databases = {item.database_name for item in client.list_databases()}\n",
+    "if database_name not in existing_databases:\n",
+    "    client.create_database(database_name)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "以及创建相对应的向量数据库表"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import time\n",
+    "\n",
+    "# pymochow schema classes and enums needed to describe the table\n",
+    "from pymochow.model.schema import Schema, Field, VectorIndex, HNSWParams\n",
+    "from pymochow.model.enum import FieldType, IndexType, MetricType, TableState\n",
+    "from pymochow.model.table import Partition\n",
+    "\n",
+    "# Handle to the database that will hold the table\n",
+    "db = client.database(database_name)\n",
+    "\n",
+    "# Dimension of the vector column; it must equal the length of the embedding vectors stored in it\n",
+    "EMBEDDING_DIM = 384\n",
+    "\n",
+    "# Table columns\n",
+    "fields = [\n",
+    "    Field(\"id\", FieldType.UINT64, primary_key=True, partition_key=True, auto_increment=False, not_null=True),\n",
+    "    Field(\"text\", FieldType.STRING),\n",
+    "    Field(\"metadata\", FieldType.STRING),\n",
+    "    Field(\"source\", FieldType.STRING),\n",
+    "    Field(\"vector\", FieldType.FLOAT_VECTOR, not_null=True, dimension=EMBEDDING_DIM)\n",
+    "]\n",
+    "\n",
+    "# Table indexes: an HNSW index with L2 distance on the vector column.\n",
+    "# Every index has to reference a column declared in `fields`; there is no \"author\" column to index\n",
+    "indexes = [\n",
+    "    VectorIndex(index_name=\"vector_idx\", field=\"vector\", index_type=IndexType.HNSW, metric_type=MetricType.L2, params=HNSWParams(m=32, efconstruction=200)),\n",
+    "]\n",
+    "\n",
+    "table_name = \"chunks\"\n",
+    "\n",
+    "# Create the table only if it does not exist yet, so that the cell can be re-run\n",
+    "existing_tables = {item.table_name for item in db.list_table()}\n",
+    "if table_name not in existing_tables:\n",
+    "    db.create_table(table_name=table_name, replication=1, partition=Partition(partition_num=1), schema=Schema(fields=fields, indexes=indexes))\n",
+    "\n",
+    "# Poll every 2 seconds until the table state is NORMAL, giving up after MAX_POLLS attempts\n",
+    "MAX_POLLS = 60\n",
+    "for _ in range(MAX_POLLS):\n",
+    "    time.sleep(2)\n",
+    "    table = db.describe_table(table_name)\n",
+    "    if table.state == TableState.NORMAL:\n",
+    "        break\n",
+    "else:\n",
+    "    raise TimeoutError(f\"table {table_name!r} did not reach the NORMAL state in time\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 准备向量数据\n",
+    "\n",
+    "在经过上述的步骤之后,我们成功在 Vector DB 的实例中创建了一个数据表,可以在接下来的步骤中用于存储向量表示\n",
+    "\n",
+    "在完成了向量数据库的创建之后,我们就可以开始尝试向向量数据库中添加数据了。为了演示,我们选择从网页上获取一篇知乎专栏,用于展示如何结合 Langchain 进行数据的向量化存储"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "\n",
+    "import qianfan  # Qianfan platform SDK\n",
+    "from langchain_community.document_loaders import WebBaseLoader  # loads documents from web pages\n",
+    "from langchain.text_splitter import RecursiveCharacterTextSplitter  # splits text into chunks\n",
+    "from pymochow.model.table import Row  # row object written to the table\n",
+    "\n",
+    "# Load the source article from the web page\n",
+    "loader = WebBaseLoader(\"https://zhuanlan.zhihu.com/p/85289282\")\n",
+    "documents = loader.load()\n",
+    "\n",
+    "# Split the article into chunks of at most 384 characters, without overlap.\n",
+    "# Separators are tried in order and \"\" matches any text, so it has to be last;\n",
+    "# placed earlier it stops the search and the punctuation separators are never used\n",
+    "text_splitter = RecursiveCharacterTextSplitter(\n",
+    "    chunk_size=384,\n",
+    "    chunk_overlap=0,\n",
+    "    separators=[\"\\n\\n\", \"\\n\", \"。\", \",\", \",\", \" \", \"\"]\n",
+    ")\n",
+    "all_splits = text_splitter.split_documents(documents)\n",
+    "\n",
+    "# Embedding client; requests are limited to 5 per second to stay below the rate limit\n",
+    "emb = qianfan.Embedding(query_per_second=5)\n",
+    "\n",
+    "# One embedding vector per chunk, computed with the SDK's default embedding model\n",
+    "chunk_vectors = []\n",
+    "for chunk in all_splits:\n",
+    "    resp = emb.do(texts=[chunk.page_content])\n",
+    "    chunk_vectors.append(resp['data'][0]['embedding'])\n",
+    "\n",
+    "# Build one table row per chunk\n",
+    "rows = []\n",
+    "for index, chunk in enumerate(all_splits):\n",
+    "    chunk_metadata = chunk.metadata or {}\n",
+    "    rows.append(\n",
+    "        Row(\n",
+    "            id=index,\n",
+    "            text=chunk.page_content,\n",
+    "            metadata=json.dumps(chunk_metadata),\n",
+    "            source=chunk_metadata.get(\"source\", \"\"),\n",
+    "            vector=chunk_vectors[index],\n",
+    "        )\n",
+    "    )\n",
+    "\n",
+    "# Write the rows in batches of at most 1000 (the per-call limit noted for upsert),\n",
+    "# then rebuild the vector index. Errors are left to propagate so a failed write is visible\n",
+    "UPSERT_BATCH_SIZE = 1000\n",
+    "table = client.database(database_name).describe_table(table_name)\n",
+    "for start in range(0, len(rows), UPSERT_BATCH_SIZE):\n",
+    "    table.upsert(rows=rows[start:start + UPSERT_BATCH_SIZE])\n",
+    "table.rebuild_index(\"vector_idx\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 使用向量数据库直接进行 RAG\n",
+    "\n",
+    "当你完成上述两个步骤之后,你就有一个可以直接用于查询的云端向量数据库实例了。\n",
+    "\n",
+    "此时,我们可以结合 Langchain 中集成的 Vector DB 以及千帆组件来实现在 Langchain 中配合 Vector DB 进行查询"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_community.vectorstores import BaiduVectorDB\n",
+    "from langchain_community.vectorstores.baiduvectordb import ConnectionParams, TableParams\n",
+    "from langchain_community.embeddings import QianfanEmbeddingsEndpoint\n",
+    "from langchain_community.chat_models import QianfanChatEndpoint\n",
+    "from langchain.chains import RetrievalQA\n",
+    "\n",
+    "# Embedding model used to embed queries, and the connection parameters for VectorDB\n",
+    "embedding_model = QianfanEmbeddingsEndpoint()\n",
+    "conn_params = ConnectionParams(\n",
+    "    endpoint=endpoint,\n",
+    "    account=account,\n",
+    "    api_key=api_key\n",
+    ")\n",
+    "\n",
+    "# LangChain vector store pointing at the database and table prepared above\n",
+    "vector_db = BaiduVectorDB(\n",
+    "    embedding=embedding_model,\n",
+    "    connection_params=conn_params,\n",
+    "    table_params=TableParams(EMBEDDING_DIM),\n",
+    "    database_name=database_name,\n",
+    "    table_name=table_name,\n",
+    "    drop_old=False,\n",
+    ")\n",
+    "\n",
+    "# Retriever over the vector store and the chat model that writes the answer\n",
+    "retriever = vector_db.as_retriever(search_type=\"similarity\")\n",
+    "qianfan_chat_model = QianfanChatEndpoint(model=\"ERNIE-Bot\", temperature=0.1)\n",
+    "\n",
+    "# Question-answering chain that also returns the retrieved source documents\n",
+    "qa = RetrievalQA.from_chain_type(\n",
+    "    llm=qianfan_chat_model,\n",
+    "    chain_type=\"refine\",\n",
+    "    retriever=retriever,\n",
+    "    return_source_documents=True,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "在接下来的部分中,我们可以尝试输入内容,来体验 RAG 的查询返回结果"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Put your question in the `query` variable\n",
+    "query = \"明朝开国皇帝是谁\"\n",
+    "\n",
+    "# Calling the chain object directly is deprecated in LangChain 0.1; invoke() is the supported entry point\n",
+    "res = qa.invoke({\"query\": query})\n",
+    "answer, docs = res['result'], res['source_documents']\n",
+    "\n",
+    "print(answer)\n",
+    "print(docs)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "py39",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype":
"text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}