CREATE TABLE documents ( id INT PRIMARY KEY AUTO_INCREMENT, title TEXT, content TEXT, category VARCHAR(50), published_date TIMESTAMP, embedding VECTOR(10) -- 10 维向量 ); INSERT INTO documents (title, content, category, published_date, embedding) SELECT CONCAT('Document Title ', numbers.i) AS title, CONCAT('This is the content for document ', numbers.i, '. It covers topics related to ', CASE numbers.i % 5 WHEN 0 THEN 'technology and innovation.' WHEN 1 THEN 'financial performance and market analysis.' WHEN 2 THEN 'marketing strategies and customer engagement.' WHEN 3 THEN 'legal compliance and regulatory updates.' ELSE 'human resources management and talent development.' END) AS content, CASE numbers.i % 5 WHEN 0 THEN 'tech' WHEN 1 THEN 'finance' WHEN 2 THEN 'marketing' WHEN 3 THEN 'legal' ELSE 'hr' END AS category, DATE_SUB(NOW(), INTERVAL FLOOR(RAND(numbers.i) * 730) DAY) AS published_date, -- 过去两年的随机日期 -- 直接拼接 10 个随机数来构建 10 维向量字符串,避免复杂的子查询 CONCAT('[', ROUND(RAND(numbers.i + 1) * 2 - 1, 4), ',', -- 每个 RAND 调用使用不同的种子,确保随机性 ROUND(RAND(numbers.i + 2) * 2 - 1, 4), ',', ROUND(RAND(numbers.i + 3) * 2 - 1, 4), ',', ROUND(RAND(numbers.i + 4) * 2 - 1, 4), ',', ROUND(RAND(numbers.i + 5) * 2 - 1, 4), ',', ROUND(RAND(numbers.i + 6) * 2 - 1, 4), ',', ROUND(RAND(numbers.i + 7) * 2 - 1, 4), ',', ROUND(RAND(numbers.i + 8) * 2 - 1, 4), ',', ROUND(RAND(numbers.i + 9) * 2 - 1, 4), ',', ROUND(RAND(numbers.i + 10) * 2 - 1, 4), ']') AS embedding_str FROM (SELECT @i := @i + 1 AS i FROM information_schema.columns, (SELECT @i := 0) AS t LIMIT 100000) AS numbers;
创建索引
CREATE VECTOR INDEX idx_embedding_hnsw ON documents (embedding) WITH (distance=L2, type=hnsw); CREATE FULLTEXT INDEX idx_fulltext_title_content ON documents (title, content); CREATE INDEX idx_published_date ON documents (published_date);
测试1
通过hnsw索引进行向量查询
use test; SET @query_vector_str = '[0.1, 0.2, -0.1, 0.3, 0.0, -0.2, 0.4, -0.3, 0.1, 0.05]'; SELECT id, title, SUBSTR(content, 1, 100) AS content_preview, category, l2_distance(embedding, @query_vector_str) AS relevance_score FROM documents ORDER BY relevance_score APPROXIMATE LIMIT 10;
测试2 全文索引查询
全文索引查询
use test; SELECT id, title, SUBSTR(content, 1, 100) AS content_preview, category FROM documents WHERE MATCH(title, content) AGAINST ('Title 99 legal' IN BOOLEAN MODE) LIMIT 10;
测试 3 向量+全文索引 hybrid search
向量+全文索引查询
use test; SET @customer_feedback_query_vector_str = '[-0.1, -0.2, 0.1, -0.3, 0.0, 0.2, -0.4, 0.3, -0.1, -0.05]'; SELECT id, title, SUBSTR(content, 1, 100) AS content_preview, l2_distance(embedding, @customer_feedback_query_vector_str) AS relevance_score FROM documents WHERE id IN ( SELECT id FROM documents ORDER BY l2_distance(embedding, @customer_feedback_query_vector_str) APPROXIMATE LIMIT 1000 ) AND MATCH(title, content) AGAINST ('title leg' IN BOOLEAN MODE) ORDER BY relevance_score LIMIT 5;
小结:在进行OceanBase简短的Hybrid search 后,你的公司在云端或是有混合云的打算,且需要Hybrid search能力,但整体的系统希望在兼容MySQL数据库系统上,可以考虑平行迁移到OceanBase MySQL 兼容的云端系统上,因MySQL已经无法再混合搜索,向量搜索再有建树,未来的大量业务将需要向量和hybrid search的能力,如HNSW 等,建议有实力且业务想继续发展的公司可以考虑在往前走一步。