diff --git a/v2.5.x/assets/keyword-match.png b/v2.5.x/assets/keyword-match.png index f56818e33..d5bc08069 100644 Binary files a/v2.5.x/assets/keyword-match.png and b/v2.5.x/assets/keyword-match.png differ diff --git a/v2.5.x/site/en/userGuide/search-query-get/full-text-search.md b/v2.5.x/site/en/userGuide/search-query-get/full-text-search.md index 7831a434c..84746b4b3 100644 --- a/v2.5.x/site/en/userGuide/search-query-get/full-text-search.md +++ b/v2.5.x/site/en/userGuide/search-query-get/full-text-search.md @@ -55,6 +55,13 @@ To enable full text search, create a collection with a specific schema. This sch First, create the schema and add the necessary fields:​ +
+ Python + Java + Node.js + cURL +
+ ```python from pymilvus import MilvusClient, DataType, Function, FunctionType​ ​ @@ -66,6 +73,86 @@ schema.add_field(field_name="sparse", datatype=DataType.SPARSE_FLOAT_VECTOR)​ ``` +```java +import io.milvus.v2.common.DataType; +import io.milvus.v2.service.collection.request.AddFieldReq; +import io.milvus.v2.service.collection.request.CreateCollectionReq; + +CreateCollectionReq.CollectionSchema schema = CreateCollectionReq.CollectionSchema.builder() + .build(); +schema.addField(AddFieldReq.builder() + .fieldName("id") + .dataType(DataType.Int64) + .isPrimaryKey(true) + .autoID(true) + .build()); +schema.addField(AddFieldReq.builder() + .fieldName("text") + .dataType(DataType.VarChar) + .maxLength(1000) + .enableAnalyzer(true) + .build()); +schema.addField(AddFieldReq.builder() + .fieldName("sparse") + .dataType(DataType.SparseFloatVector) + .build()); +``` + +```javascript +import { MilvusClient, DataType } from "@zilliz/milvus2-sdk-node"; + +const address = "http://localhost:19530"; +const token = "root:Milvus"; +const client = new MilvusClient({address, token}); +const schema = [ + { + name: "id", + data_type: DataType.Int64, + is_primary_key: true, + }, + { + name: "text", + data_type: "VarChar", + enable_analyzer: true, + enable_match: true, + max_length: 1000, + }, + { + name: "sparse", + data_type: DataType.SparseFloatVector, + }, +]; + + +console.log(res.results) +``` + +```curl +export schema='{ + "autoId": true, + "enabledDynamicField": false, + "fields": [ + { + "fieldName": "id", + "dataType": "Int64", + "isPrimary": true + }, + { + "fieldName": "text", + "dataType": "VarChar", + "elementTypeParams": { + "max_length": 1000, + "enable_analyzer": true + } + }, + { + "fieldName": "sparse", + "dataType": "SparseFloatVector" + } + ] + }' +``` + In this configuration,​ - `id`: serves as the primary key and is automatically generated with `auto_id=True`.​ @@ -76,6 +163,13 @@ In this configuration,​ Now, define a function that will convert your text into 
sparse vector representations and then add it to the schema:​ +
+ Python + Java + Node.js + cURL +
+ ```python bm25_function = Function(​ name="text_bm25_emb", # Function name​ @@ -88,6 +182,68 @@ schema.add_function(bm25_function)​ ``` +```java +import io.milvus.common.clientenum.FunctionType; +import io.milvus.v2.service.collection.request.CreateCollectionReq.Function; + +import java.util.*; + +schema.addFunction(Function.builder() + .functionType(FunctionType.BM25) + .name("text_bm25_emb") + .inputFieldNames(Collections.singletonList("text")) + .outputFieldNames(Collections.singletonList("sparse")) + .build()); +``` + +```javascript +const functions = [ + { + name: 'text_bm25_emb', + description: 'bm25 function', + type: FunctionType.BM25, + input_field_names: ['text'], + output_field_names: ['sparse'], + params: {}, + }, +]; +``` + +```curl +export schema='{ + "autoId": true, + "enabledDynamicField": false, + "fields": [ + { + "fieldName": "id", + "dataType": "Int64", + "isPrimary": true + }, + { + "fieldName": "text", + "dataType": "VarChar", + "elementTypeParams": { + "max_length": 1000, + "enable_analyzer": true + } + }, + { + "fieldName": "sparse", + "dataType": "SparseFloatVector" + } + ], + "functions": [ + { + "name": "text_bm25_emb", + "type": "BM25", + "inputFieldNames": ["text"], + "outputFieldNames": ["sparse"], + "params": {} + } + ] + }' +``` +

Parameter​

Description​

@@ -122,6 +278,13 @@ For collections with multiple `VARCHAR` fields requiring text-to-sparse-vector c After defining the schema with necessary fields and the built-in function, set up the index for your collection. To simplify this process, use `AUTOINDEX` as the `index_type`, an option that allows Milvus to choose and configure the most suitable index type based on the structure of your data.​ + + ```python index_params = MilvusClient.prepare_index_params()​ ​ @@ -133,6 +296,37 @@ index_params.add_index(​ ``` +```java +import io.milvus.v2.common.IndexParam; + +List indexes = new ArrayList<>(); +indexes.add(IndexParam.builder() + .fieldName("sparse") + .indexType(IndexParam.IndexType.SPARSE_INVERTED_INDEX) + .metricType(IndexParam.MetricType.BM25) + .build()); +``` + +```javascript +const index_params = [ + { + fieldName: "sparse", + metricType: "BM25", + indexType: "AUTOINDEX", + }, +]; +``` + +```curl +export indexParams='[ + { + "fieldName": "sparse", + "metricType": "BM25", + "indexType": "AUTOINDEX" + } + ]' +``` +

Parameter​

Description​

@@ -155,6 +349,13 @@ index_params.add_index(​ Now create the collection using the schema and index parameters defined.​ + + ```python MilvusClient.create_collection(​ collection_name='demo', ​ @@ -164,23 +365,115 @@ MilvusClient.create_collection(​ ``` +```java +import io.milvus.v2.service.collection.request.CreateCollectionReq; + +CreateCollectionReq requestCreate = CreateCollectionReq.builder() + .collectionName("demo") + .collectionSchema(schema) + .indexParams(indexes) + .build(); +client.createCollection(requestCreate); +``` + +```javascript +await client.create_collection( + collection_name: 'demo', + schema: schema, + index_params: index_params +); +``` + +```curl +export CLUSTER_ENDPOINT="http://localhost:19530" +export TOKEN="root:Milvus" + +curl --request POST \ +--url "${CLUSTER_ENDPOINT}/v2/vectordb/collections/create" \ +--header "Authorization: Bearer ${TOKEN}" \ +--header "Content-Type: application/json" \ +-d "{ + \"collectionName\": \"demo\", + \"schema\": $schema, + \"indexParams\": $indexParams +}" +``` + ## Insert text data After setting up your collection and index, you're ready to insert text data. In this process, you need only to provide the raw text. 
The built-in function we defined earlier automatically generates the corresponding sparse vector for each text entry.​ + + ```python -MilvusClient.insert('demo', [​ - {'text': 'Artificial intelligence was founded as an academic discipline in 1956.'},​ - {'text': 'Alan Turing was the first person to conduct substantial research in AI.'},​ - {'text': 'Born in Maida Vale, London, Turing was raised in southern England.'},​ -])​ +client.insert('demo', [ + {'text': 'information retrieval is a field of study.'}, + {'text': 'information retrieval focuses on finding relevant information in large datasets.'}, + {'text': 'data mining and information retrieval overlap in research.'}, +]) ``` +```java +import com.google.gson.Gson; +import com.google.gson.JsonObject; + +import io.milvus.v2.service.vector.request.InsertReq; + +Gson gson = new Gson(); +List<JsonObject> rows = Arrays.asList( + gson.fromJson("{\"text\": \"information retrieval is a field of study.\"}", JsonObject.class), + gson.fromJson("{\"text\": \"information retrieval focuses on finding relevant information in large datasets.\"}", JsonObject.class), + gson.fromJson("{\"text\": \"data mining and information retrieval overlap in research.\"}", JsonObject.class) +); + +client.insert(InsertReq.builder() + .collectionName("demo") + .data(rows) + .build()); +``` + +```javascript +await client.insert({ +collection_name: 'demo', +data: [ + {'text': 'information retrieval is a field of study.'}, + {'text': 'information retrieval focuses on finding relevant information in large datasets.'}, + {'text': 'data mining and information retrieval overlap in research.'}, +]}); +``` + +```curl +curl --request POST \ +--url "${CLUSTER_ENDPOINT}/v2/vectordb/entities/insert" \ +--header "Authorization: Bearer ${TOKEN}" \ +--header "Content-Type: application/json" \ +-d '{ + "data": [ + {"text": "information retrieval is a field of study."}, + {"text": "information retrieval focuses on finding relevant information in large datasets."}, + {"text": 
"data mining and information retrieval overlap in research."} + ], + "collectionName": "demo" +}' +``` + ## Perform full text search Once you've inserted data into your collection, you can perform full text searches using raw text queries. Milvus automatically converts your query into a sparse vector and ranks the matched search results using the BM25 algorithm, and then returns the topK (`limit`) results.​ + + ```python search_params = {​ 'params': {'drop_ratio_search': 0.6},​ @@ -188,7 +481,7 @@ search_params = {​ ​ MilvusClient.search(​ collection_name='demo', ​ - data=['Who started AI research?'],​ + data=['whats the focus of information retrieval?'],​ anns_field='sparse',​ limit=3,​ search_params=search_params​ @@ -196,6 +489,56 @@ MilvusClient.search(​ ``` +```java +import io.milvus.v2.service.vector.request.SearchReq; +import io.milvus.v2.service.vector.request.data.EmbeddedText; +import io.milvus.v2.service.vector.response.SearchResp; + +Map<String,Object> searchParams = new HashMap<>(); +searchParams.put("drop_ratio_search", 0.6); +SearchResp searchResp = client.search(SearchReq.builder() + .collectionName("demo") + .data(Collections.singletonList(new EmbeddedText("whats the focus of information retrieval?"))) + .annsField("sparse") + .topK(3) + .searchParams(searchParams) + .outputFields(Collections.singletonList("text")) + .build()); +``` + +```javascript +await client.search({ + collection_name: 'demo', + data: ['whats the focus of information retrieval?'], + anns_field: 'sparse', + limit: 3, + params: {'drop_ratio_search': 0.6}, +}); +``` + +```curl +curl --request POST \ +--url "${CLUSTER_ENDPOINT}/v2/vectordb/entities/search" \ +--header "Authorization: Bearer ${TOKEN}" \ +--header "Content-Type: application/json" \ +--data-raw '{ + "collectionName": "demo", + "data": [ + "whats the focus of information retrieval?" + ], + "annsField": "sparse", + "limit": 3, + "outputFields": [ + "text" + ], + "searchParams":{ + "params":{ + "drop_ratio_search":0.6 + } + } +}' +``` +

Parameter​

Description​

diff --git a/v2.5.x/site/en/userGuide/search-query-get/keyword-match.md b/v2.5.x/site/en/userGuide/search-query-get/keyword-match.md index 6b7a36af7..16077bad5 100644 --- a/v2.5.x/site/en/userGuide/search-query-get/keyword-match.md +++ b/v2.5.x/site/en/userGuide/search-query-get/keyword-match.md @@ -22,9 +22,9 @@ Milvus integrates [Tantivy](https://github.com/quickwit-oss/tantivy) to power it 2. [Indexing](index-scalar-fields.md): After text analysis, Milvus creates an inverted index that maps each unique token to the documents containing it.​ -When a user performs a text match, the inverted index is used to quickly retrieve all documents containing the keywords. This is much faster than scanning through each document individually.​ +When a user performs a text match, the inverted index is used to quickly retrieve all documents containing the terms. This is much faster than scanning through each document individually.​ -![Text Match](../../../assets/keyword-match.png) +![Text Match](../../../../assets/keyword-match.png) ## Enable text match @@ -34,6 +34,13 @@ Text match works on the `VARCHAR` field type, which is essentially the string da To enable text match for a specific `VARCHAR` field, set both the `enable_analyzer` and `enable_match` parameters to `True` when defining the field schema. 
This instructs Milvus to tokenize text and create an inverted index for the specified field, allowing fast and efficient text matches.​ + + ```python from pymilvus import MilvusClient, DataType​ ​ @@ -49,6 +56,75 @@ schema.add_field(​ ``` +```java +import io.milvus.v2.common.DataType; +import io.milvus.v2.service.collection.request.AddFieldReq; +import io.milvus.v2.service.collection.request.CreateCollectionReq; + +CreateCollectionReq.CollectionSchema schema = CreateCollectionReq.CollectionSchema.builder() + .enableDynamicField(false) + .build(); + +schema.addField(AddFieldReq.builder() + .fieldName("text") + .dataType(DataType.VarChar) + .maxLength(1000) + .enableAnalyzer(true) + .enableMatch(true) + .build()); + +``` + +```javascript +const schema = [ + { + name: "id", + data_type: DataType.Int64, + is_primary_key: true, + }, + { + name: "text", + data_type: "VarChar", + enable_analyzer: true, + enable_match: true, + max_length: 1000, + }, + { + name: "sparse", + data_type: DataType.SparseFloatVector, + }, +]; + +``` + +```curl +export schema='{ + "autoId": true, + "enabledDynamicField": false, + "fields": [ + { + "fieldName": "id", + "dataType": "Int64", + "isPrimary": true + }, + { + "fieldName": "text", + "dataType": "VarChar", + "elementTypeParams": { + "max_length": 1000, + "enable_analyzer": true, + "enable_match": true + } + }, + { + "fieldName": "sparse", + "dataType": "SparseFloatVector" + } + ] + }' + +``` + ### Optional: Configure an analyzer​ The performance and accuracy of text matching depend on the selected analyzer. Different analyzers are tailored to various languages and text structures, so choosing the right one can significantly impact search results for your specific use case.​ @@ -57,6 +133,13 @@ By default, Milvus uses the `standard` analyzer, which tokenizes text based on w In cases where a different analyzer is required, you can configure one using the `analyzer_params` parameter. 
For example, to apply the `english` analyzer for processing English text:​ + + ```python analyzer_params={​ "type": "english"​ @@ -73,6 +156,75 @@ schema.add_field(​ ``` +```java +Map analyzerParams = new HashMap<>(); +analyzerParams.put("type", "english"); +schema.addField(AddFieldReq.builder() + .fieldName("text") + .dataType(DataType.VarChar) + .maxLength(200) + .enableAnalyzer(true) + .analyzerParams(analyzerParams) + .enableMatch(true) + .build()); + +``` + +```javascript +const schema = [ + { + name: "id", + data_type: DataType.Int64, + is_primary_key: true, + }, + { + name: "text", + data_type: "VarChar", + enable_analyzer: true, + enable_match: true, + max_length: 1000, + analyzer_params: { type: 'english' }, + }, + { + name: "sparse", + data_type: DataType.SparseFloatVector, + }, +]; + +``` + +```curl +export schema='{ + "autoId": true, + "enabledDynamicField": false, + "fields": [ + { + "fieldName": "id", + "dataType": "Int64", + "isPrimary": true + }, + { + "fieldName": "text", + "dataType": "VarChar", + "elementTypeParams": { + "max_length": 200, + "enable_analyzer": true, + "enable_match": true, + "analyzer_params": {"type": "english"} + } + }, + { + "fieldName": "my_vector", + "dataType": "FloatVector", + "elementTypeParams": { + "dim": "5" + } + } + ] + }' + +``` + Milvus also provides various other analyzers suited to different languages and scenarios. For more details, refer to [​Overview](analyzer-overview.md).​ ## Use text match @@ -81,36 +233,78 @@ Once you have enabled text match for a VARCHAR field in your collection schema, ### TEXT_MATCH expression syntax​ -The `TEXT_MATCH` expression is used to specify the field and the keywords to search for. Its syntax is as follows:​ +The `TEXT_MATCH` expression is used to specify the field and the terms to search for. Its syntax is as follows:​ -```python +``` TEXT_MATCH(field_name, text)​ ``` - `field_name`: The name of the VARCHAR field to search for.​ -- `text`: The keywords to search for. 
Multiple keywords can be separated by spaces or other appropriate delimiters based on the language and configured analyzer.​ +- `text`: The terms to search for. Multiple terms can be separated by spaces or other appropriate delimiters based on the language and configured analyzer.​ -By default, `TEXT_MATCH` uses the **OR** matching logic, meaning it will return documents that contain any of the specified keywords. For example, to search for documents containing the keywords `machine` or `deep` in the `text` field, use the following expression:​ +By default, `TEXT_MATCH` uses the **OR** matching logic, meaning it will return documents that contain any of the specified terms. For example, to search for documents containing the term `machine` or `deep` in the `text` field, use the following expression:​ + + ```python filter = "TEXT_MATCH(text, 'machine deep')"​ +``` +```java +String filter = "TEXT_MATCH(text, 'machine deep')"; +``` + +```javascript +const filter = "TEXT_MATCH(text, 'machine deep')"; +``` + +```curl +export filter="\"TEXT_MATCH(text, 'machine deep')\"" ``` You can also combine multiple `TEXT_MATCH` expressions using logical operators to perform **AND** matching. For example, to search for documents containing both `machine` and `deep` in the `text` field, use the following expression:​ + + ```python filter = "TEXT_MATCH(text, 'machine') and TEXT_MATCH(text, 'deep')"​ +``` +```java +String filter = "TEXT_MATCH(text, 'machine') and TEXT_MATCH(text, 'deep')"; +``` + +```javascript +const filter = "TEXT_MATCH(text, 'machine') and TEXT_MATCH(text, 'deep')" +``` + +```curl +export filter="\"TEXT_MATCH(text, 'machine') and TEXT_MATCH(text, 'deep')\"" ``` ### Search with text match​ Text match can be used in combination with vector similarity search to narrow the search scope and improve search performance. 
By filtering the collection using text match before vector similarity search, you can reduce the number of documents that need to be searched, resulting in faster query times.​ -In this example, the `filter` expression filters the search results to only include documents that match the specified keywords `keyword1` or `keyword2`. The vector similarity search is then performed on this filtered subset of documents.​ +In this example, the `filter` expression filters the search results to only include documents that match the specified term `keyword1` or `keyword2`. The vector similarity search is then performed on this filtered subset of documents.​ + ```python # Match entities with `keyword1` or `keyword2`​ @@ -129,11 +323,72 @@ result = MilvusClient.search(​ ``` +```java +String filter = "TEXT_MATCH(text, 'keyword1 keyword2')"; + +SearchResp searchResp = client.search(SearchReq.builder() + .collectionName("YOUR_COLLECTION_NAME") + .annsField("embeddings") + .data(Collections.singletonList(queryVector))) + .filter(filter) + .topK(10) + .outputFields(Arrays.asList("id", "text")) + .build()); +``` + +```javascript +// Match entities with `keyword1` or `keyword2` +const filter = "TEXT_MATCH(text, 'keyword1 keyword2')"; + +// Assuming 'embeddings' is the vector field and 'text' is the VARCHAR field +const result = await client.search( + collection_name: "YOUR_COLLECTION_NAME", // Your collection name + anns_field: "embeddings", // Vector field name + data: [query_vector], // Query vector + filter: filter, + params: {"nprobe": 10}, + limit: 10, // Max. 
number of results to return + output_fields: ["id", "text"] //Fields to return +); +``` + +```curl +export filter="\"TEXT_MATCH(text, 'keyword1 keyword2')\"" + +export CLUSTER_ENDPOINT="http://localhost:19530" +export TOKEN="root:Milvus" + +curl --request POST \ +--url "${CLUSTER_ENDPOINT}/v2/vectordb/entities/search" \ +--header "Authorization: Bearer ${TOKEN}" \ +--header "Content-Type: application/json" \ +-d '{ + "collectionName": "demo2", + "annsField": "my_vector", + "data": [[0.19886812562848388, 0.06023560599112088, 0.6976963061752597, 0.2614474506242501, 0.838729485096104]], + "filter": '"$filter"', + "searchParams": { + "params": { + "nprobe": 10 + } + }, + "limit": 3, + "outputFields": ["text","id"] +}' +``` + ### Query with text match​ -Text match can also be used for scalar filtering in query operations. By specifying a `TEXT_MATCH` expression in the `expr` parameter of the `query()` method, you can retrieve documents that match the given keywords.​ +Text match can also be used for scalar filtering in query operations. 
By specifying a `TEXT_MATCH` expression in the `expr` parameter of the `query()` method, you can retrieve documents that match the given terms.​ -The example below retrieves documents where the `text` field contains both keywords `keyword1` and `keyword2`.​ +The example below retrieves documents where the `text` field contains both terms `keyword1` and `keyword2`.​ + + ```python # Match entities with both `keyword1` and `keyword2`​ @@ -147,6 +402,45 @@ result = MilvusClient.query(​ ``` +```java +String filter = "TEXT_MATCH(text, 'keyword1') and TEXT_MATCH(text, 'keyword2')"; + +QueryResp queryResp = client.query(QueryReq.builder() + .collectionName("YOUR_COLLECTION_NAME") + .filter(filter) + .outputFields(Arrays.asList("id", "text")) + .build() +); +``` + +```javascript +// Match entities with both `keyword1` and `keyword2` +const filter = "TEXT_MATCH(text, 'keyword1') and TEXT_MATCH(text, 'keyword2')"; + +const result = await client.query({ + collection_name: "YOUR_COLLECTION_NAME", + filter: filter, + output_fields: ["id", "text"] +}); +``` + +```curl +export filter="\"TEXT_MATCH(text, 'keyword1') and TEXT_MATCH(text, 'keyword2')\"" + +export CLUSTER_ENDPOINT="http://localhost:19530" +export TOKEN="root:Milvus" + +curl --request POST \ +--url "${CLUSTER_ENDPOINT}/v2/vectordb/entities/query" \ +--header "Authorization: Bearer ${TOKEN}" \ +--header "Content-Type: application/json" \ +-d '{ + "collectionName": "demo2", + "filter": '"$filter"', + "outputFields": ["id", "text"] +}' +``` + ## Considerations - Enabling text matching for a field triggers the creation of an inverted index, which consumes storage resources. Consider storage impact when deciding to enable this feature, as it varies based on text size, unique tokens, and the analyzer used.​ diff --git a/version.json b/version.json index 8085d97f0..afcd7732d 100644 --- a/version.json +++ b/version.json @@ -1,4 +1,4 @@ { - "version": "v2.4.x", + "version": "v2.5.x", "released": "yes" }