EFCore默认的索引类型和PostgreSQL GIN索引优化示例
基础数据准备
考虑我们有一张物料表和库存表。
CREATE TABLE IF NOT EXISTS public.mm_material
(
"Id" integer NOT NULL GENERATED BY DEFAULT AS IDENTITY ( INCREMENT 1 START 1 MINVALUE 1 MAXVALUE 2147483647 CACHE 1 ),
"Code" character varying(64) COLLATE pg_catalog."default" NOT NULL,
"ShortName" character varying(256) COLLATE pg_catalog."default" NOT NULL,
"Specification" character varying(50) COLLATE pg_catalog."default" NOT NULL,
"Status" integer NOT NULL,
"MeasureUnitId" integer NOT NULL,
CONSTRAINT "PK_mm_material" PRIMARY KEY ("Id"),
CONSTRAINT "AK_mm_material_Code" UNIQUE ("Code"),
CONSTRAINT "FK_mm_material_mm_material_measureunit_MeasureUnitId" FOREIGN KEY ("MeasureUnitId")
REFERENCES public.mm_material_measureunit ("Id") MATCH SIMPLE
ON UPDATE NO ACTION
ON DELETE CASCADE
)
-- 索引略
;
CREATE TABLE IF NOT EXISTS public.mm_inventory
(
"Id" uuid NOT NULL,
"IsLocked" boolean NOT NULL,
"Locker" text COLLATE pg_catalog."default" NOT NULL,
"WarehouseId" integer NOT NULL,
"Slot" text COLLATE pg_catalog."default" NOT NULL,
"Scope" text COLLATE pg_catalog."default" NOT NULL,
"MaterialCode" text COLLATE pg_catalog."default" NOT NULL,
"MaterialId" integer NOT NULL,
"Batch" text COLLATE pg_catalog."default" NOT NULL,
"ExternalCode" text COLLATE pg_catalog."default" NOT NULL,
"Amount" numeric NOT NULL,
"StockKind" integer NOT NULL,
"Note" text COLLATE pg_catalog."default",
"ProducedAt" timestamp with time zone,
"ExpiredAt" timestamp with time zone NOT NULL,
"CreatedAt" timestamp with time zone NOT NULL,
CONSTRAINT "PK_mm_inventory" PRIMARY KEY ("Id"),
CONSTRAINT "FK_mm_inventory_mm_material_MaterialId" FOREIGN KEY ("MaterialId")
REFERENCES public.mm_material ("Id") MATCH SIMPLE
ON UPDATE NO ACTION
ON DELETE CASCADE,
CONSTRAINT "FK_mm_inventory_mm_warehouse_WarehouseId" FOREIGN KEY ("WarehouseId")
REFERENCES public.mm_warehouse ("Id") MATCH SIMPLE
ON UPDATE NO ACTION
ON DELETE CASCADE
)
-- 索引略
;
为了测试,我们向这张物料单表中插入大概10万条测试数据:
insert into mm_material ("Code", "ShortName", "Specification","Status", "MeasureUnitId")
select i::varchar, concat('TestMat', i::varchar), '', 1, 1
from generate_series(200, 100000) i
然后,我们为所有的物料都插入一条库存记录:
INSERT INTO mm_inventory
("Id", "IsLocked", "Locker", "WarehouseId", "Slot", "Scope", "MaterialCode", "MaterialId", "Batch", "ExternalCode", "Amount", "StockKind", "Note", "ProducedAt", "ExpiredAt", "CreatedAt")
SELECT
gen_random_uuid() AS "Id", -- 生成唯一的 UUID
FALSE AS "IsLocked", -- 假定库存项默认未锁定
'' AS "Locker", -- 默认锁定者为空
100 AS "WarehouseId", -- 假定仓库 ID
'' AS "Slot", -- 默认货位为空
'' AS "Scope", -- 默认作用域为空
"Code" AS "MaterialCode", -- 使用 Material 表中的 Code 作为 MaterialCode
"Id" AS "MaterialId", -- 使用 Material 表中的 Id
'' AS "Batch", -- 默认批次为空
'' AS "ExternalCode", -- 默认外部代码为空
0 AS "Amount", -- 默认库存量为 0
1 AS "StockKind", -- 假定库存类型为 1
NULL AS "Note", -- 默认备注为空
NOW() AS "ProducedAt", -- 假定当前时间为生产时间
NOW() + INTERVAL '365 days' AS "ExpiredAt", -- 默认过期时间为一年后
NOW() AS "CreatedAt" -- 当前时间为创建时间
FROM mm_material;
根据物料名称查库存
有一个常见的场景是根据物料名称,查询对应的库存:
SELECT m."Id", m."Amount", m."Batch", m."CreatedAt", m."ExpiredAt", m."ExternalCode", m."IsLocked", m."Locker", m."MaterialCode", m."MaterialId", m."Note", m."ProducedAt", m.xmin, m."Scope", m."Slot", m."StockKind", m."WarehouseId"
FROM mm_inventory AS m
INNER JOIN mm_material AS m0 ON m."MaterialId" = m0."Id"
WHERE m."StockKind" IN (1, 2, 3) AND m0."ShortName" LIKE '%202%' ESCAPE '\'
LIMIT 10 OFFSET 0
这个查询最大的问题是,ShortName
是模糊查询'%202%',而且是左右两侧都含有%
通配符,这意味着它无法利用B-Tree
索引。explain analyze
会输出:
"Limit (cost=0.29..2064.76 rows=10 width=102) (actual time=0.014..0.197 rows=10 loops=1)"
" -> Nested Loop (cost=0.29..2064.76 rows=10 width=102) (actual time=0.013..0.195 rows=10 loops=1)"
" -> Seq Scan on mm_material m0 (cost=0.00..1981.53 rows=10 width=4) (actual time=0.006..0.169 rows=10 loops=1)"
" Filter: ((""ShortName"")::text ~~ '%202%'::text)"
" Rows Removed by Filter: 1819"
" -> Index Scan using ""IX_mm_inventory_MaterialId"" on mm_inventory m (cost=0.29..8.31 rows=1 width=102) (actual time=0.002..0.002 rows=1 loops=10)"
" Index Cond: (""MaterialId"" = m0.""Id"")"
" Filter: (""StockKind"" = ANY ('{1,2,3}'::integer[]))"
"Planning Time: 0.221 ms"
"Execution Time: 0.213 ms"
注意上面的Seq Scan on mm_material
,查询条件 Filter: ((""ShortName"")::text ~~ '%202%'::text)
。显然这步查询是全表扫描,效率极低。
EFCore 的默认索引类型
也许你会说,可以为ShortName
加上索引即可。比如如果我们使用EFCore,通常会使用HasIndex()
配置索引:
internal class MaterialEntityTypeConfiguration : IEntityTypeConfiguration<Material>
{
public void Configure(EntityTypeBuilder<Material> builder)
{
builder.ToTable("mm_material");
// ...
// 用于通过名称和规格型号来检索物料
builder.HasIndex(m => new { m.ShortName, m.Specification });
}
}
但是问题是,这并不会有任何改善。因为EFCore
配置的.HasIndex()
默认创建的是B-Tree索引。而这里的查询条件是%xxx%
,无法利用B-Tree
索引。
倒排索引
一个简单办法是使用倒排索引:
CREATE EXTENSION IF NOT EXISTS pg_trgm;
CREATE INDEX ON mm_material USING gin (
"ShortName" gin_trgm_ops,
"Specification" gin_trgm_ops
);
这是我们再explain analyze
一下,输出类似于:
"Limit (cost=12.37..131.94 rows=10 width=102) (actual time=0.046..0.070 rows=10 loops=1)"
" -> Nested Loop (cost=12.37..131.94 rows=10 width=102) (actual time=0.045..0.069 rows=10 loops=1)"
" -> Bitmap Heap Scan on mm_material m0 (cost=12.08..48.70 rows=10 width=4) (actual time=0.039..0.044 rows=10 loops=1)"
" Recheck Cond: ((""ShortName"")::text ~~ '%202%'::text)"
" Heap Blocks: exact=3"
" -> Bitmap Index Scan on ""mm_material_ShortName_Specification_idx"" (cost=0.00..12.07 rows=10 width=0) (actual time=0.027..0.027 rows=299 loops=1)"
" Index Cond: ((""ShortName"")::text ~~ '%202%'::text)"
" -> Index Scan using ""IX_mm_inventory_MaterialId"" on mm_inventory m (cost=0.29..8.31 rows=1 width=102) (actual time=0.002..0.002 rows=1 loops=10)"
" Index Cond: (""MaterialId"" = m0.""Id"")"
" Filter: (""StockKind"" = ANY ('{1,2,3}'::integer[]))"
注意Seq Scan on mm_material m0
变成了:
Bitmap Heap Scan on mm_material
-> Bitmap Index Scan on ""mm_material_ShortName_Specification_idx""
查询的消耗时间也从0.197s 变成了0.070s。
Npgsql对EFCore中索引的扩展支持
尽管EFCore没有对B-Tree以外的索引进行支持,但是Npgsql提供了针对Postgre的专门扩展.
索引方式:
protected override void OnModelCreating(ModelBuilder modelBuilder)
=> modelBuilder.Entity<Blog>()
.HasIndex(b => b.Url)
.HasMethod("gin");
这会被翻译成PGSQL中的USING <method> clause
。
索引算子类型:
protected override void OnConfiguring(DbContextOptionsBuilder builder)
=> modelBuilder.Entity<Blog>()
.HasIndex(b => new { b.Id, b.Name })
.HasOperators(null, "text_pattern_ops");
这会生成如下SQL代码:
CREATE INDEX "IX_blogs_Id_Name" ON blogs (
"Id",
"Name" text_pattern_ops
);
文章中的实用建议和操作指南,让读者受益匪浅,值得珍藏。
在现有基础上可尝试多媒介形式呈现。
文化差异分析可再深化以避免误读。