Sunday, December 19, 2010

Postgresql case-insensitive design and searching choice

First, what not to do, don't use ilike...

create table y
(
id serial not null,
email text not null unique
);

insert into y(email) 
values('iSteve.jobs@apple.com') ,('linus.Torvalds@linUX.com');
insert into y(email) 
select n from generate_series(1,1000) as i(n);
create index ix_y on y(email);

explain select * from y 
where email ilike 
    ANY(ARRAY['ISteve.Jobs@Apple.com','Linus.Torvalds@Linux.com']);

...,as that would result to Sequential Scan:

memdb=# explain select * from y where email ilike ANY(ARRAY['ISteve.Jobs@Apple.com','Linus.Torvalds@Linux.com']);
                                       QUERY PLAN                                       
----------------------------------------------------------------------------------------
 Seq Scan on y  (cost=0.00..17.52 rows=1 width=7)
   Filter: (email ~~* ANY ('{ISteve.Jobs@Apple.com,Linus.Torvalds@Linux.com}'::text[]))
(2 rows)


It's either you create an indexed lower expression...

create function lower(t text[]) returns text[]
as
$$
select lower($1::text)::text[]
$$ language sql;

create unique index ix_y_2 on y(lower(email));

explain select * from y 
where lower(email) = 
    ANY(lower(ARRAY['ISteve.Jobs@Apple.com','Linus.Torvalds@Linux.com']));

...which properly uses index:
memdb=# explain select * from y where lower(email) = ANY(lower(ARRAY['ISteve.Jobs@Apple.com','Linus.Torvalds@Linux.com']));
                                                           QUERY PLAN                                                           
--------------------------------------------------------------------------------------------------------------------------------
 Bitmap Heap Scan on y  (cost=22.60..27.98 rows=10 width=7)
   Recheck Cond: (lower(email) = ANY ((lower(('{ISteve.Jobs@Apple.com,Linus.Torvalds@Linux.com}'::text[])::text))::text[]))
   ->  Bitmap Index Scan on ix_y_2  (cost=0.00..22.60 rows=10 width=0)
         Index Cond: (lower(email) = ANY ((lower(('{ISteve.Jobs@Apple.com,Linus.Torvalds@Linux.com}'::text[])::text))::text[]))
(4 rows)


Or you use citext data type...

create table x
(
id serial not null,
email citext not null unique
);

insert into x(email) 
values('iSteve.jobs@apple.com'),('linus.Torvalds@linUX.com');
insert into x(email) 
select n from generate_series(1,1000) as i(n);
create index ix_x on x(email);

explain select * from x 
where email = 
ANY(ARRAY['ISteve.Jobs@Apple.com','Linus.Torvalds@Linux.com']::citext[]);

...which properly uses index even you don't create an index on expression (e.g. create index zzz on yyy(lower(field))):

memdb=# explain select * from x where email = ANY(ARRAY['ISteve.Jobs@Apple.com','Linus.Torvalds@Linux.com']::citext[]);
                                            QUERY PLAN                                            
--------------------------------------------------------------------------------------------------
 Bitmap Heap Scan on x  (cost=8.52..12.75 rows=2 width=7)
   Recheck Cond: (email = ANY ('{ISteve.Jobs@Apple.com,Linus.Torvalds@Linux.com}'::citext[]))
   ->  Bitmap Index Scan on ix_x  (cost=0.00..8.52 rows=2 width=0)
         Index Cond: (email = ANY ('{ISteve.Jobs@Apple.com,Linus.Torvalds@Linux.com}'::citext[]))
(4 rows)

Happy Indexing!

No comments:

Post a Comment