add hash table index type
This commit is contained in:
211
addindex.c
211
addindex.c
@@ -10,6 +10,8 @@
|
||||
#include "uint32.h"
|
||||
#include "mstorage.h"
|
||||
#include <errmsg.h>
|
||||
#include <ctype.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
mstorage_t idx;
|
||||
char* map;
|
||||
@@ -30,9 +32,39 @@ int compari(const void* a,const void* b) {
|
||||
return *(uint32*)b-*(uint32*)a;
|
||||
}
|
||||
|
||||
uint32 hash(const unsigned char* c,unsigned long keylen) {
|
||||
unsigned long h=0;
|
||||
unsigned long i;
|
||||
for (i=0; i<keylen; ++i) {
|
||||
/* from djb's cdb */
|
||||
h += (h<<5);
|
||||
h ^= c[i];
|
||||
}
|
||||
return (uint32)h;
|
||||
}
|
||||
|
||||
uint32 hash_tolower(const unsigned char* c,unsigned long keylen) {
|
||||
unsigned long h=0;
|
||||
unsigned long i;
|
||||
for (i=0; i<keylen; ++i) {
|
||||
/* from djb's cdb */
|
||||
h += (h<<5);
|
||||
h ^= tolower(c[i]);
|
||||
}
|
||||
return (uint32)h;
|
||||
}
|
||||
|
||||
/*
 * Hash the value located at byte offset ofs inside the global mapping
 * `map`.
 *
 * Two layouts are distinguished by the first byte of the value:
 *   - non-zero: the value is treated as a NUL-terminated string and its
 *     strlen() bytes are hashed;
 *   - zero: a uint32 length is unpacked from offset 1 and that many
 *     bytes, beginning at offset 5, are hashed.
 *
 * A non-zero ignorecase selects hash_tolower(), otherwise hash() is used.
 */
uint32 hashmapped(uint32 ofs,int ignorecase) {
  unsigned char* val=(unsigned char*)map+ofs;
  unsigned long n;
  if (*val)
    n=strlen((char*)val);
  else {
    uint32 packed;
    uint32_unpack(val+1,&packed);
    n=packed;
    val+=5;	/* skip the zero marker byte and the packed length */
  }
  if (ignorecase)
    return hash_tolower(val,n);
  return hash(val,n);
}
|
||||
|
||||
int main(int argc,char* argv[]) {
|
||||
enum { SORTEDTABLE, HASHTABLE } mode;
|
||||
long filelen;
|
||||
unsigned long filelen;
|
||||
char* filename=argv[1];
|
||||
uint32 magic,attribute_count,record_count,indices_offset,size_of_string_table;
|
||||
uint32 wanted,casesensitive,dn,objectClass;
|
||||
@@ -106,14 +138,14 @@ int main(int argc,char* argv[]) {
|
||||
mstorage_add(&idx,(char*)&i,4);
|
||||
++counted;
|
||||
x+=j*8;
|
||||
} else if (wanted==objectClass) {
|
||||
uint32_unpack(x+12,&k);
|
||||
mstorage_add(&idx,(char*)&k,4);
|
||||
if (fastindex)
|
||||
mstorage_add(&idx,(char*)&i,4);
|
||||
++counted;
|
||||
x+=j*8;
|
||||
} else {
|
||||
if (wanted==objectClass) {
|
||||
uint32_unpack(x+12,&k);
|
||||
mstorage_add(&idx,(char*)&k,4);
|
||||
if (fastindex)
|
||||
mstorage_add(&idx,(char*)&i,4);
|
||||
++counted;
|
||||
}
|
||||
x+=16;
|
||||
for (; j>2; --j) {
|
||||
uint32_unpack(x,&k);
|
||||
@@ -169,6 +201,169 @@ int main(int argc,char* argv[]) {
|
||||
}
|
||||
}
|
||||
} else if (mode==HASHTABLE) {
|
||||
uint32 i,j,counted,cur;
|
||||
char* x;
|
||||
struct node {
|
||||
uint32 recnum,hashcode;
|
||||
}* y;
|
||||
struct htentry {
|
||||
uint32 count;
|
||||
uint32* x;
|
||||
}* tab;
|
||||
uint32 maxtabsize;
|
||||
uint32 maxcoll,mincoll,cmaxcoll,nmaxcoll,nmincoll,cmincoll;
|
||||
uint32 indexsize;
|
||||
cmaxcoll=cmincoll=0; /* shut gcc up */
|
||||
if (wanted==dn)
|
||||
counted=record_count;
|
||||
else {
|
||||
x=map+5*4+size_of_string_table+attribute_count*8;
|
||||
counted=0;
|
||||
for (i=0; i<record_count; ++i) {
|
||||
uint32 j,k;
|
||||
uint32_unpack(x,&j);
|
||||
if (wanted==objectClass)
|
||||
++counted;
|
||||
x+=16;
|
||||
for (; j>2; --j) {
|
||||
uint32_unpack(x,&k);
|
||||
if (k==wanted)
|
||||
++counted;
|
||||
x+=8;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!counted) die(111,"attribute does not occur?!");
|
||||
y=malloc(counted*sizeof(struct node));
|
||||
if (!y) die(111,"out of memory");
|
||||
x=map+5*4+size_of_string_table+attribute_count*8;
|
||||
for (cur=i=0; i<record_count; ++i) {
|
||||
uint32 k;
|
||||
uint32_unpack(x,&j);
|
||||
if (wanted==dn) {
|
||||
uint32_unpack(x+8,&k);
|
||||
y[cur].recnum=i;
|
||||
y[cur].hashcode=hashmapped(k,ignorecase);
|
||||
++cur;
|
||||
x+=j*8;
|
||||
} else {
|
||||
if (wanted==objectClass) {
|
||||
uint32_unpack(x+12,&k);
|
||||
y[cur].recnum=i;
|
||||
y[cur].hashcode=hashmapped(k,ignorecase);
|
||||
++cur;
|
||||
}
|
||||
x+=16;
|
||||
for (; j>2; --j) {
|
||||
uint32_unpack(x,&k);
|
||||
if (k==wanted) {
|
||||
y[cur].recnum=i;
|
||||
y[cur].hashcode=hashmapped(k,ignorecase);
|
||||
++cur;
|
||||
}
|
||||
x+=8;
|
||||
}
|
||||
}
|
||||
}
|
||||
buffer_putulong(buffer_1,counted);
|
||||
buffer_putsflush(buffer_1," entries hashed; looking for hash table size with least collisions...");
|
||||
i=counted;
|
||||
if (!(i&1)) ++i;
|
||||
maxtabsize=counted+counted/8;
|
||||
tab=malloc(maxtabsize*sizeof(struct htentry));
|
||||
if (!tab) die(111,"out of memory");
|
||||
maxcoll=nmaxcoll=nmincoll=0; mincoll=-1;
|
||||
for (; i<maxtabsize; ++i) {
|
||||
uint32 j,k,chains;
|
||||
memset(tab,0,i*sizeof(struct htentry));
|
||||
for (j=k=chains=0; j<counted; ++j) {
|
||||
uint32 l=y[j].hashcode%i;
|
||||
if (++tab[l].count>1) ++k;
|
||||
if (tab[l].count==2) ++chains;
|
||||
}
|
||||
if (k>maxcoll) {
|
||||
nmaxcoll=i;
|
||||
maxcoll=k;
|
||||
cmaxcoll=chains;
|
||||
}
|
||||
if (k<mincoll) {
|
||||
nmincoll=i;
|
||||
mincoll=k;
|
||||
cmincoll=chains;
|
||||
}
|
||||
}
|
||||
buffer_putsflush(buffer_1," done.\n");
|
||||
buffer_puts(buffer_1,"minimum collisions at ");
|
||||
buffer_putulong(buffer_1,nmincoll);
|
||||
buffer_puts(buffer_1,": ");
|
||||
buffer_putulong(buffer_1,mincoll);
|
||||
buffer_puts(buffer_1," (");
|
||||
buffer_putulong(buffer_1,cmincoll);
|
||||
buffer_puts(buffer_1," chains), maximum collisions at ");
|
||||
buffer_putulong(buffer_1,nmaxcoll);
|
||||
buffer_puts(buffer_1,": ");
|
||||
buffer_putulong(buffer_1,maxcoll);
|
||||
buffer_puts(buffer_1," (");
|
||||
buffer_putulong(buffer_1,cmaxcoll);
|
||||
buffer_putsflush(buffer_1," chains).\n");
|
||||
|
||||
maxtabsize=nmincoll;
|
||||
memset(tab,0,maxtabsize*sizeof(struct htentry));
|
||||
|
||||
for (j=0; j<counted; ++j) {
|
||||
uint32 l=y[j].hashcode%maxtabsize;
|
||||
tab[l].x=realloc(tab[l].x,(++tab[l].count)*sizeof(tab[l].x[0]));
|
||||
if (!tab[l].x) die(111,"out of memory");
|
||||
tab[l].x[tab[l].count-1]=y[j].recnum;
|
||||
}
|
||||
|
||||
indexsize=4*4+maxtabsize*4;
|
||||
for (j=0; j<maxtabsize; ++j)
|
||||
if (tab[j].count>1)
|
||||
indexsize+=(tab[j].count+1)*4;
|
||||
|
||||
free(y);
|
||||
munmap(map,filelen);
|
||||
|
||||
{
|
||||
int fd=open(filename,O_RDWR);
|
||||
char* dest,* x,* z;
|
||||
if (fd<0)
|
||||
diesys(111,"Could not re-open database file read-write");
|
||||
ftruncate(fd,filelen+indexsize);
|
||||
map=mmap(0,filelen+indexsize,PROT_WRITE,MAP_SHARED,fd,0);
|
||||
if (map==(char*)-1)
|
||||
diesys(111,"Could not mmap database file read-write");
|
||||
uint32_pack(map+casesensitive,ignorecase);
|
||||
dest=map+filelen;
|
||||
uint32_pack(dest,3); /* index type 3 == hash table */
|
||||
uint32_pack(dest+4,filelen+indexsize); /* offset of next index */
|
||||
uint32_pack(dest+2*4,wanted); /* indexed attribute */
|
||||
uint32_pack(dest+3*4,maxtabsize); /* hash table size in uint32s */
|
||||
x=dest+4*4;
|
||||
z=x+maxtabsize*4;
|
||||
for (j=0; j<maxtabsize; ++j) {
|
||||
if (tab[j].count==0) {
|
||||
uint32_pack(x,0);
|
||||
x+=4;
|
||||
} else if (tab[j].count==1) {
|
||||
uint32_pack(x,tab[j].x[0]);
|
||||
x+=4;
|
||||
} else if (tab[j].count>1) {
|
||||
uint32 k;
|
||||
uint32_pack(x,filelen+(z-dest));
|
||||
x+=4;
|
||||
uint32_pack(z,tab[j].count);
|
||||
z+=4;
|
||||
for (k=0; k<tab[j].count; ++k) {
|
||||
uint32_pack(z,tab[j].x[k]);
|
||||
z+=4;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
munmap(map,filelen);
|
||||
} else
|
||||
die(1,"invalid index type requested");
|
||||
return 0;
|
||||
|
||||
Reference in New Issue
Block a user