[20170402]函数索引standard_hash.txt
--//这几天放假,在家里看书<<Apress.Expert.Oracle.Indexing.and.Access.Paths.Maximum.Performance.for.Your.Database.2nd.Edition.148421983X.pdf>>
--//这本书比较合适初学者,我以前看过第一版,所以这个版本看的很快.
--//里面提到函数standard_hash,可以用来满足一般的查询,做一个记录.
1.环境:
SCOTT@test01p> @ ver1
PORT_STRING VERSION BANNER CON_ID
------------------------------ -------------- -------------------------------------------------------------------------------- ----------
IBMPC/WIN_NT64-9.1.0 12.1.0.1.0 Oracle Database 12c Enterprise Edition Release 12.1.0.1.0 - 64bit Production 0
2.测试数据:
create table t (id number ,v1 varchar2(200) ,v2 nvarchar2(200));
insert into t
select level
,lpad('x',100,'x')||level v1
,lpad('a',100,'a')||level v2
from dual
connect by level < 2e5;
create index i_t_v1 on t(v1);
create index i_t_v2 on t(v2);
--//分析略.
SCOTT@test01p> select segment_name,header_file,header_block, BYTES,BLOCKS from dba_segments where owner=user and segment_name in ('I_T_V1','I_T_V2');
SEGMENT_NAME HEADER_FILE HEADER_BLOCK BYTES BLOCKS
-------------------- ----------- ------------ ---------- ----------
I_T_V1 9 362 28311552 3456
I_T_V2 9 10602 54525952 6656
--//因为nvarchar2类型的存储问题,导致在v2字段建立的索引比在v1字段建立的索引大许多.
3.测试:
select * from t where v2=lpad('a',100,'a')||42;
Plan hash value: 970572957
-----------------------------------------------------------------------------------------------
| Id | Operation | Name | E-Rows |E-Bytes| Cost (%CPU)| E-Time |
-----------------------------------------------------------------------------------------------
| 0 | SELECT STATEMENT | | | | 5 (100)| |
| 1 | TABLE ACCESS BY INDEX ROWID BATCHED| T | 1 | 323 | 5 (0)| 00:00:01 |
|* 2 | INDEX RANGE SCAN | I_T_V2 | 1 | | 4 (0)| 00:00:01 |
-----------------------------------------------------------------------------------------------
Query Block Name / Object Alias (identified by operation id):
-------------------------------------------------------------
1 - SEL$1 / T@SEL$1
2 - SEL$1 / T@SEL$1
select * from t where v1=lpad('x',100,'x')||42;
Plan hash value: 1965038091
-----------------------------------------------------------------------------------------------
| Id | Operation | Name | E-Rows |E-Bytes| Cost (%CPU)| E-Time |
-----------------------------------------------------------------------------------------------
| 0 | SELECT STATEMENT | | | | 4 (100)| |
| 1 | TABLE ACCESS BY INDEX ROWID BATCHED| T | 1 | 323 | 4 (0)| 00:00:01 |
|* 2 | INDEX RANGE SCAN | I_T_V1 | 1 | | 3 (0)| 00:00:01 |
-----------------------------------------------------------------------------------------------
Query Block Name / Object Alias (identified by operation id):
-------------------------------------------------------------
1 - SEL$1 / T@SEL$1
2 - SEL$1 / T@SEL$1
--//都能很好的使用索引,而如果建立直方图呢?这也是我生产系统遇到的问题.
3.建立直方图:
SCOTT@test01p> execute sys.dbms_stats.gather_table_stats ( OwnName => user,TabName => 't',Estimate_Percent => NULL,Method_Opt => 'FOR ALL COLUMNS SIZE auto ',Cascade => True ,No_Invalidate => false)
PL/SQL procedure successfully completed.
SCOTT@test01p> @ tab_lh scott T ''
DISPLAY TABLE_NAME OF COLUMN_NAME INFORMATION.
INPUT OWNER TABLE_NAME COLUMN
SAMPLE : @ TAB_LH TABLE_NAME [COLUMN_NAME]
IF NOT INPUT COLUMN_NAME ,USE "" .
COLUMN_NAME DATA_TYPE DATA_LENGTH N NUM_DISTINCT DENSITY SAMPLE_SIZE TRANS_LOW TRANS_HIGH NUM_NULLS NUM_BUCKETS LAST_ANALYZED HISTOGRAM DATA_DEFAULT
------------------------ ---------- ----------- - ------------ ---------- ----------- -------------------------------- -------------------------------- ---------- ----------- ------------------- --------------- ---------------
ID NUMBER 22 Y 199999 .000005 199999 1 199999 0 1 2017-04-04 21:14:52 NONE
V1 VARCHAR2 200 Y 1 .0000025 199999 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 0 1 2017-04-04 21:14:52 FREQUENCY
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
V2 NVARCHAR2 400 Y 1 .0000025 199999 0 1 2017-04-04 21:14:52 FREQUENCY
--//可以发现建立了直方图.可以发现v1,v2 NUM_DISTINCT=1.
select * from t where v2=lpad('a',100,'a')||42;
Plan hash value: 1601196873
---------------------------------------------------------------------------
| Id | Operation | Name | E-Rows |E-Bytes| Cost (%CPU)| E-Time |
---------------------------------------------------------------------------
| 0 | SELECT STATEMENT | | | | 2739 (100)| |
|* 1 | TABLE ACCESS FULL| T | 199K| 61M| 2739 (1)| 00:00:01 |
---------------------------------------------------------------------------
Query Block Name / Object Alias (identified by operation id):
-------------------------------------------------------------
1 - SEL$1 / T@SEL$1
select * from t where v1=lpad('x',100,'x')||42;
Plan hash value: 1601196873
---------------------------------------------------------------------------
| Id | Operation | Name | E-Rows |E-Bytes| Cost (%CPU)| E-Time |
---------------------------------------------------------------------------
| 0 | SELECT STATEMENT | | | | 2739 (100)| |
|* 1 | TABLE ACCESS FULL| T | 199K| 61M| 2739 (1)| 00:00:01 |
---------------------------------------------------------------------------
Query Block Name / Object Alias (identified by operation id):
-------------------------------------------------------------
1 - SEL$1 / T@SEL$1
--//因为11G下对于建立直方图仅仅对前面32个字符敏感,而对于nvarchar2类型仅仅对前面16个字符敏感(注与安装时选择的字符集有关).
--//我一般选择国家字符集AL16UTF16,看来以后越来越趋向使用AL32UTF8.这样缺省情况下直方图变成了鸡肋,毫无用处,执行计划选择了全表扫描.
4.这样可以使用一种特殊的方法解决,就算反转索引:
drop index i_t_v2;
create index if_t_v2 on t(v2 desc);
select * from t where v2=lpad('a',100,'a')||42;
Plan hash value: 2068059304
------------------------------------------------------------------------------------------------
| Id | Operation | Name | E-Rows |E-Bytes| Cost (%CPU)| E-Time |
------------------------------------------------------------------------------------------------
| 0 | SELECT STATEMENT | | | | 43 (100)| |
| 1 | TABLE ACCESS BY INDEX ROWID BATCHED| T | 1 | 323 | 43 (0)| 00:00:01 |
|* 2 | INDEX RANGE SCAN | IF_T_V2 | 1 | | 42 (0)| 00:00:01 |
------------------------------------------------------------------------------------------------
Query Block Name / Object Alias (identified by operation id):
-------------------------------------------------------------
1 - SEL$1 / T@SEL$1
2 - SEL$1 / T@SEL$1
--//注:反转索引包含NULL,这点注意.而且不合适范围扫描.比如like.
--//可以发现使用索引.
--//即使建立直方图也一样.
SCOTT@test01p> execute sys.dbms_stats.gather_table_stats ( OwnName => nvl('',user),TabName => 't',Estimate_Percent => NULL,Method_Opt => 'FOR ALL COLUMNS SIZE auto for all hidden columns size 254 ',Cascade => True ,No_Invalidate => false)
SCOTT@test01p> @ tab_lh scott T ''
DISPLAY TABLE_NAME OF COLUMN_NAME INFORMATION.
INPUT OWNER TABLE_NAME COLUMN
SAMPLE : @ TAB_LH TABLE_NAME [COLUMN_NAME]
IF NOT INPUT COLUMN_NAME ,USE "" .
COLUMN_NAME DATA_TYPE DATA_LENGTH N NUM_DISTINCT DENSITY SAMPLE_SIZE TRANS_LOW TRANS_HIGH NUM_NULLS NUM_BUCKETS LAST_ANALYZED HISTOGRAM DATA_DEFAULT
------------------------ ---------- ----------- - ------------ ---------- ----------- -------------------------------- -------------------------------- ---------- ----------- ------------------- --------------- --------------------
ID NUMBER 22 Y 199999 .000005 199999 1 199999 0 1 2017-04-04 21:28:13 NONE
V1 VARCHAR2 200 Y 199999 .000005 199999 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 0 1 2017-04-04 21:28:13 NONE
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
V2 NVARCHAR2 400 Y 199999 .000005 199999 0 1 2017-04-04 21:28:13 NONE
SYS_NC00004$ RAW 601 Y 1 .0000025 199999 0 1 2017-04-04 21:29:00 FREQUENCY "V2"
--//这样又会选择全表扫描.SYS_NC00004$ 的 NUM_DISTINCT=1.
--//同时又产生一个问题,如果字段很长,索引很大,而使用函数standard_hash,这样就很好解决这个问题.
--//而不用修改sql语句.
drop index if_t_v2;
create index if_t_v2 on t(standard_hash(v2));
--//重新分析,建立函数索引最好重新分析,因为缺省oracle不会在这个字段上建立一些统计信息.
SCOTT@test01p> select segment_name,header_file,header_block, BYTES,BLOCKS from dba_segments where owner=user and segment_name in ('I_T_V1','IF_T_V2');
SEGMENT_NAME HEADER_FILE HEADER_BLOCK BYTES BLOCKS
-------------------- ----------- ------------ ---------- ----------
I_T_V1 9 362 28311552 3456
IF_T_V2 9 10602 8388608 1024
--//对比前面可以发现IF_T_V2占用空间小了许多.
select * from t where v2=lpad('a',100,'a')||42;
Plan hash value: 2068059304
------------------------------------------------------------------------------------------------
| Id | Operation | Name | E-Rows |E-Bytes| Cost (%CPU)| E-Time |
------------------------------------------------------------------------------------------------
| 0 | SELECT STATEMENT | | | | 4 (100)| |
|* 1 | TABLE ACCESS BY INDEX ROWID BATCHED| T | 1 | 344 | 4 (0)| 00:00:01 |
|* 2 | INDEX RANGE SCAN | IF_T_V2 | 1 | | 3 (0)| 00:00:01 |
------------------------------------------------------------------------------------------------
Query Block Name / Object Alias (identified by operation id):
-------------------------------------------------------------
1 - SEL$1 / T@SEL$1
2 - SEL$1 / T@SEL$1
--//当然对于范围扫描一样无效.
--//建立直方图呢?
SCOTT@test01p> execute sys.dbms_stats.gather_table_stats ( OwnName => nvl('',user),TabName => 't',Estimate_Percent => NULL,Method_Opt => 'FOR ALL COLUMNS SIZE auto for all hidden columns size 254 ',Cascade => True ,No_Invalidate => false)
PL/SQL procedure successfully completed.
DISPLAY TABLE_NAME OF COLUMN_NAME INFORMATION.
INPUT OWNER TABLE_NAME COLUMN
SAMPLE : @ TAB_LH TABLE_NAME [COLUMN_NAME]
IF NOT INPUT COLUMN_NAME ,USE "" .
COLUMN_NAME DATA_TYPE DATA_LENGTH N NUM_DISTINCT DENSITY SAMPLE_SIZE TRANS_LOW TRANS_HIGH NUM_NULLS NUM_BUCKETS LAST_ANALYZED HISTOGRAM DATA_DEFAULT
------------------------ ---------- ----------- - ------------ ---------- ----------- -------------------------------- -------------------------------- ---------- ----------- ------------------- --------------- --------------------
ID NUMBER 22 Y 199999 .000005 199999 1 199999 0 1 2017-04-04 21:28:13 NONE
V1 VARCHAR2 200 Y 199999 .000005 199999 xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 0 1 2017-04-04 21:28:13 NONE
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
V2 NVARCHAR2 400 Y 199999 .000005 199999 0 1 2017-04-04 21:28:13 NONE
SYS_NC00004$ RAW 20 Y 200000 .000005 200000 0 254 2017-04-04 21:57:22 HYBRID STANDARD_HASH("V2")
--//可以发现SYS_NC00004$ NUM_DISTINCT=200000.
select * from t where v2=lpad('a',100,'a')||42
Plan hash value: 2068059304
------------------------------------------------------------------------------------------------
| Id | Operation | Name | E-Rows |E-Bytes| Cost (%CPU)| E-Time |
------------------------------------------------------------------------------------------------
| 0 | SELECT STATEMENT | | | | 4 (100)| |
|* 1 | TABLE ACCESS BY INDEX ROWID BATCHED| T | 1 | 344 | 4 (0)| 00:00:01 |
|* 2 | INDEX RANGE SCAN | IF_T_V2 | 1 | | 3 (0)| 00:00:01 |
------------------------------------------------------------------------------------------------
Query Block Name / Object Alias (identified by operation id):
-------------------------------------------------------------
1 - SEL$1 / T@SEL$1
2 - SEL$1 / T@SEL$1
--//依旧会使用索引.
5.//这样就提出了一个索引建立思路,如果要建立的索引字段很长,而没有范围查询要求,可以通过建立standard_hash函数索引,来解决这
--//个问题.另外我还测试ora_hash函数,这个无法解决这个问题.要修改sql语句,
--//另外这个函数遇到clob字段会出现异常.http://connormcdonald.wordpress.com/2013/03/29/ora_hash-and-lobs-not-nice-partners/.
--//重复测试:
create table t1 (a number,b varchar2(20),c clob);
insert into t1 values (1,'a','a');
commit ;
SCOTT@test01p> select ora_hash(b),ora_hash(c) from t1;
ORA_HASH(B) ORA_HASH(C)
----------- -----------
29220167 1172292995
--//等3秒.
SCOTT@test01p> select ora_hash(b),ora_hash(c) from t1;
ORA_HASH(B) ORA_HASH(C)
----------- -----------
29220167 4224997194
--//可以发现ORA_HASH(C)发生了改变. 12c执行更长的字符串,可以达到32k.我以前提过超过4000字段,应该按照lob来保存.
--//http://blog.itpub.net/267265/viewspace-776806/
drop table t1 purge;
create table t1 (id number ,v1 varchar2(32767) );
insert into t1 values (1,lpad('n',100,'n'));
insert into t1 values (2,lpad('n',30000,'n'));
insert into t1 values (3,'a');
commit ;
select dump(ora_hash(v1) from t1;
--sleep 3
select dump(ora_hash(v1) from t1;
SCOTT@test01p> select id,dump(ora_hash(v1)) c40,ora_hash(v1) from t1;
ID C40 ORA_HASH(V1)
---------- ---------------------------------------- ------------
1 Typ=2 Len=6: 197,40,14,10,79,24 3913097823
2 Typ=2 Len=6: 197,12,8,62,3,83 1107610282
3 Typ=2 Len=5: 196,30,23,2,68 29220167
--//等3秒.
SCOTT@test01p> select id,dump(ora_hash(v1)) c40,ora_hash(v1) from t1;
ID C40 ORA_HASH(V1)
---------- ---------------------------------------- ------------
1 Typ=2 Len=6: 197,40,14,10,79,24 3913097823
2 Typ=2 Len=6: 197,12,8,62,3,83 1107610282
3 Typ=2 Len=5: 196,30,23,2,68 29220167
--//没有使用clob字段类型的问题.还是不推荐这样使用.
总结:
1.可以发现函数standard_hash建立索引,可以用于等值查询,实际上如果你看执行计划可以发现有一点点不同.
select * from t where v2=lpad('a',100,'a')||42;
Plan hash value: 2068059304
------------------------------------------------------------------------------------------------
| Id | Operation | Name | E-Rows |E-Bytes| Cost (%CPU)| E-Time |
------------------------------------------------------------------------------------------------
| 0 | SELECT STATEMENT | | | | 4 (100)| |
|* 1 | TABLE ACCESS BY INDEX ROWID BATCHED| T | 1 | 344 | 4 (0)| 00:00:01 |
|* 2 | INDEX RANGE SCAN | IF_T_V2 | 1 | | 3 (0)| 00:00:01 |
------------------------------------------------------------------------------------------------
Query Block Name / Object Alias (identified by operation id):
-------------------------------------------------------------
1 - SEL$1 / T@SEL$1
2 - SEL$1 / T@SEL$1
Predicate Information (identified by operation id):
---------------------------------------------------
1 - filter("V2"=U'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa42')
2 - access("T"."SYS_NC00004$"=HEXTORAW('6B59D7D1A31FB4B3B6820FEA9D234B1B352596A9'))
--//除了access,还多了1个filter.
2.实际上除了standard_hash,还有trunc,substr(字段,1,N)也可以用于等值查询,可以使用这样的函数索引,其他我还不知道.
但是standard_hash不能支持范围查询.但是带来是索引占用空间减少.
3.另外必须要注意一个问题,就是密集插入可能会消耗CPU资源,这点特别注意.
drop table t purge ;
create table t (id number ,v1 varchar2(200) ,v2 nvarchar2(200));
create index i_t_v2 on t(v2);
set timing on
insert into t
select level
,lpad('x',100,'x')||level v1
,lpad('a',100,'a')||level v2
from dual
connect by level < 2e5;
Elapsed: 00:00:46.01
--//使用46秒.
--//重复测试:
drop table t purge ;
create table t (id number ,v1 varchar2(200) ,v2 nvarchar2(200));
create index if_t_v2 on t(standard_hash(v2));
insert into t
select level
,lpad('x',100,'x')||level v1
,lpad('a',100,'a')||level v2
from dual
connect by level < 2e5;
Elapsed: 00:00:15.34
--//使用15秒.
--//重复测试使用索引:
drop table t purge ;
create table t (id number ,v1 varchar2(200) ,v2 nvarchar2(200));
insert into t
select level
,lpad('x',100,'x')||level v1
,lpad('a',100,'a')||level v2
from dual
connect by level < 2e5;
Elapsed: 00:00:02.04
--//使用2秒.
--//索引这种cpu运算强度还是很大的,前面第1种情况主要是建立索引消耗磁盘空间大.如果选择小的字符呢.
drop table t purge ;
create table t (id number ,v1 varchar2(200) ,v2 nvarchar2(200));
create index i_t_v2 on t(v2);
insert into t
select level
,lpad('x',190,'x')||level v1
,lpad('a',6,'a')||level v2
from dual
connect by level < 2e5;
Elapsed: 00:00:06.18
--//仅仅需要6秒.如果在这种的情况使用standard_hash索引呢?
drop table t purge ;
create table t (id number ,v1 varchar2(200) ,v2 nvarchar2(200));
create index if_t_v2 on t(standard_hash(v2));
insert into t
select level
,lpad('x',190,'x')||level v1
,lpad('a',6,'a')||level v2
from dual
connect by level < 2e5;
Elapsed: 00:00:12.46
--//字段长度小效果就不明显了.