add hash table index type

This commit is contained in:
leitner
2005-10-13 16:51:44 +00:00
parent b4a257d0ad
commit 7376f0342c
4 changed files with 269 additions and 10 deletions

View File

@@ -10,6 +10,8 @@
#include "uint32.h"
#include "mstorage.h"
#include <errmsg.h>
#include <ctype.h>
#include <stdlib.h>
mstorage_t idx;
char* map;
@@ -30,9 +32,39 @@ int compari(const void* a,const void* b) {
return *(uint32*)b-*(uint32*)a;
}
uint32 hash(const unsigned char* c,unsigned long keylen) {
unsigned long h=0;
unsigned long i;
for (i=0; i<keylen; ++i) {
/* from djb's cdb */
h += (h<<5);
h ^= c[i];
}
return (uint32)h;
}
uint32 hash_tolower(const unsigned char* c,unsigned long keylen) {
unsigned long h=0;
unsigned long i;
for (i=0; i<keylen; ++i) {
/* from djb's cdb */
h += (h<<5);
h ^= tolower(c[i]);
}
return (uint32)h;
}
/* Hash the key stored at byte offset ofs inside the global mapping
 * `map`.
 *
 * Two encodings are recognised:
 *   - first byte non-zero: the key is a NUL-terminated string that
 *     starts at that byte;
 *   - first byte zero: a 32-bit length is unpacked from the four bytes
 *     following the zero byte, and the key is that many bytes starting
 *     at offset 5.
 *
 * A non-zero ignorecase selects hash_tolower(), otherwise hash(). */
uint32 hashmapped(uint32 ofs,int ignorecase) {
  unsigned char* key=(unsigned char*)map+ofs;
  unsigned long keylen;
  if (*key)
    keylen=strlen((char*)key);
  else {
    uint32 packedlen;
    uint32_unpack(key+1,&packedlen);
    key+=5;                  /* skip the zero marker and the length field */
    keylen=packedlen;
  }
  if (ignorecase)
    return hash_tolower(key,keylen);
  return hash(key,keylen);
}
int main(int argc,char* argv[]) {
enum { SORTEDTABLE, HASHTABLE } mode;
long filelen;
unsigned long filelen;
char* filename=argv[1];
uint32 magic,attribute_count,record_count,indices_offset,size_of_string_table;
uint32 wanted,casesensitive,dn,objectClass;
@@ -106,14 +138,14 @@ int main(int argc,char* argv[]) {
mstorage_add(&idx,(char*)&i,4);
++counted;
x+=j*8;
} else if (wanted==objectClass) {
uint32_unpack(x+12,&k);
mstorage_add(&idx,(char*)&k,4);
if (fastindex)
mstorage_add(&idx,(char*)&i,4);
++counted;
x+=j*8;
} else {
if (wanted==objectClass) {
uint32_unpack(x+12,&k);
mstorage_add(&idx,(char*)&k,4);
if (fastindex)
mstorage_add(&idx,(char*)&i,4);
++counted;
}
x+=16;
for (; j>2; --j) {
uint32_unpack(x,&k);
@@ -169,6 +201,169 @@ int main(int argc,char* argv[]) {
}
}
} else if (mode==HASHTABLE) {
uint32 i,j,counted,cur;
char* x;
struct node {
uint32 recnum,hashcode;
}* y;
struct htentry {
uint32 count;
uint32* x;
}* tab;
uint32 maxtabsize;
uint32 maxcoll,mincoll,cmaxcoll,nmaxcoll,nmincoll,cmincoll;
uint32 indexsize;
cmaxcoll=cmincoll=0; /* shut gcc up */
if (wanted==dn)
counted=record_count;
else {
x=map+5*4+size_of_string_table+attribute_count*8;
counted=0;
for (i=0; i<record_count; ++i) {
uint32 j,k;
uint32_unpack(x,&j);
if (wanted==objectClass)
++counted;
x+=16;
for (; j>2; --j) {
uint32_unpack(x,&k);
if (k==wanted)
++counted;
x+=8;
}
}
}
if (!counted) die(111,"attribute does not occur?!");
y=malloc(counted*sizeof(struct node));
if (!y) die(111,"out of memory");
x=map+5*4+size_of_string_table+attribute_count*8;
for (cur=i=0; i<record_count; ++i) {
uint32 k;
uint32_unpack(x,&j);
if (wanted==dn) {
uint32_unpack(x+8,&k);
y[cur].recnum=i;
y[cur].hashcode=hashmapped(k,ignorecase);
++cur;
x+=j*8;
} else {
if (wanted==objectClass) {
uint32_unpack(x+12,&k);
y[cur].recnum=i;
y[cur].hashcode=hashmapped(k,ignorecase);
++cur;
}
x+=16;
for (; j>2; --j) {
uint32_unpack(x,&k);
if (k==wanted) {
y[cur].recnum=i;
y[cur].hashcode=hashmapped(k,ignorecase);
++cur;
}
x+=8;
}
}
}
buffer_putulong(buffer_1,counted);
buffer_putsflush(buffer_1," entries hashed; looking for hash table size with least collisions...");
i=counted;
if (!(i&1)) ++i;
maxtabsize=counted+counted/8;
tab=malloc(maxtabsize*sizeof(struct htentry));
if (!tab) die(111,"out of memory");
maxcoll=nmaxcoll=nmincoll=0; mincoll=-1;
for (; i<maxtabsize; ++i) {
uint32 j,k,chains;
memset(tab,0,i*sizeof(struct htentry));
for (j=k=chains=0; j<counted; ++j) {
uint32 l=y[j].hashcode%i;
if (++tab[l].count>1) ++k;
if (tab[l].count==2) ++chains;
}
if (k>maxcoll) {
nmaxcoll=i;
maxcoll=k;
cmaxcoll=chains;
}
if (k<mincoll) {
nmincoll=i;
mincoll=k;
cmincoll=chains;
}
}
buffer_putsflush(buffer_1," done.\n");
buffer_puts(buffer_1,"minimum collisions at ");
buffer_putulong(buffer_1,nmincoll);
buffer_puts(buffer_1,": ");
buffer_putulong(buffer_1,mincoll);
buffer_puts(buffer_1," (");
buffer_putulong(buffer_1,cmincoll);
buffer_puts(buffer_1," chains), maximum collisions at ");
buffer_putulong(buffer_1,nmaxcoll);
buffer_puts(buffer_1,": ");
buffer_putulong(buffer_1,maxcoll);
buffer_puts(buffer_1," (");
buffer_putulong(buffer_1,cmaxcoll);
buffer_putsflush(buffer_1," chains).\n");
maxtabsize=nmincoll;
memset(tab,0,maxtabsize*sizeof(struct htentry));
for (j=0; j<counted; ++j) {
uint32 l=y[j].hashcode%maxtabsize;
tab[l].x=realloc(tab[l].x,(++tab[l].count)*sizeof(tab[l].x[0]));
if (!tab[l].x) die(111,"out of memory");
tab[l].x[tab[l].count-1]=y[j].recnum;
}
indexsize=4*4+maxtabsize*4;
for (j=0; j<maxtabsize; ++j)
if (tab[j].count>1)
indexsize+=(tab[j].count+1)*4;
free(y);
munmap(map,filelen);
{
int fd=open(filename,O_RDWR);
char* dest,* x,* z;
if (fd<0)
diesys(111,"Could not re-open database file read-write");
ftruncate(fd,filelen+indexsize);
map=mmap(0,filelen+indexsize,PROT_WRITE,MAP_SHARED,fd,0);
if (map==(char*)-1)
diesys(111,"Could not mmap database file read-write");
uint32_pack(map+casesensitive,ignorecase);
dest=map+filelen;
uint32_pack(dest,3); /* index type 3 == hash table */
uint32_pack(dest+4,filelen+indexsize); /* offset of next index */
uint32_pack(dest+2*4,wanted); /* indexed attribute */
uint32_pack(dest+3*4,maxtabsize); /* hash table size in uint32s */
x=dest+4*4;
z=x+maxtabsize*4;
for (j=0; j<maxtabsize; ++j) {
if (tab[j].count==0) {
uint32_pack(x,0);
x+=4;
} else if (tab[j].count==1) {
uint32_pack(x,tab[j].x[0]);
x+=4;
} else if (tab[j].count>1) {
uint32 k;
uint32_pack(x,filelen+(z-dest));
x+=4;
uint32_pack(z,tab[j].count);
z+=4;
for (k=0; k<tab[j].count; ++k) {
uint32_pack(z,tab[j].x[k]);
z+=4;
}
}
}
}
munmap(map,filelen);
} else
die(1,"invalid index type requested");
return 0;