rework parse so that it can process larger files.

This commit is contained in:
leitner
2004-02-14 00:31:42 +00:00
parent a755f3154b
commit 0cad372ea0
9 changed files with 171 additions and 104 deletions

View File

@@ -1,4 +1,4 @@
#DEBUG=1
DEBUG=1
all: t1 t2 parse dumpidx idx2ldif addindex bindrequest tinyldap \
tinyldap_standalone tinyldap_debug ldapclient ldapclient_str \
@@ -28,7 +28,7 @@ ldif.a: ldif_parse.o ldap_match_mapped.o
storage.a: strstorage.o strduptab.o mstorage_add.o mduptab_add.o \
bstr_diff.o mduptab_adds.o bstr_diff2.o mstorage_add_bin.o \
mstorage_init.o mstorage_init_persistent.o mstorage_unmap.o \
mduptab_init.o
mduptab_init.o mduptab_init_reuse.o
auth.a: auth.o

View File

@@ -2,11 +2,20 @@
#include "mmap.h"
#include "uint32.h"
int main() {
int main(int argc,char* argv[]) {
int verbose=0;
unsigned long filelen;
char* map=mmap_read("data",&filelen);
char* fn=argc<2?"data":argv[1];
char* map=mmap_read(fn,&filelen);
uint32 magic,attribute_count,record_count,indices_offset,size_of_string_table;
if (!map) {
buffer_puts(buffer_2,"could not open `");
buffer_puts(buffer_2,fn);
buffer_puts(buffer_2,"´: ");
buffer_puterror(buffer_2);
buffer_putnlflush(buffer_2);
exit(1);
}
buffer_puts(buffer_1,"magic: ");
uint32_unpack(map,&magic);
uint32_unpack(map+4,&attribute_count);

View File

@@ -15,9 +15,18 @@
mduptab_t attributes,classes;
mstorage_t stringtable;
long dn, objectClass;
/* this is called after each record.
* If it returns -1, ldif_parse will exit immediately.
* If it returns 0, ldif_parse will continue parsing and overwrite the
* current ldaprec.
* If it returns 1, ldif_parse will allocate a new ldaprec and link it
* using the next pointer in the current ldaprec.
* If the callback is NULL, a callback that always returns 1 is assumed.
* */
int (*ldif_parse_callback)(struct ldaprec* l);
unsigned long ldifrecords;
static void addattribute(struct ldaprec** l,long name,long val) {
@@ -27,7 +36,11 @@ static void addattribute(struct ldaprec** l,long name,long val) {
(*l)->a[(*l)->n].value=val;
++(*l)->n;
} else {
buffer_putsflush(buffer_2,"LDIF parse error: too many attributes!\n");
buffer_puts(buffer_2,"LDIF parse error: too many attributes!:\n ");
buffer_puts(buffer_2,attributes.strings.root+name);
buffer_puts(buffer_2,"\nat dn\n ");
buffer_puts(buffer_2,(*l)->dn+stringtable.root);
buffer_putnlflush(buffer_2);
exit(1);
}
}
@@ -129,8 +142,7 @@ lookagain:
if (!stralloc_catb(&payload,buf,n)) goto nomem;
goto lookagain;
} else if (c=='\n') {
struct ldaprec* m=malloc(sizeof(struct ldaprec));
if (!m) return 2;
struct ldaprec* m;
if (!stralloc_0(&payload)) goto nomem;
if (base64) {
@@ -155,11 +167,28 @@ lookagain:
if ((val=mstorage_add_bin(&stringtable,payload.s,len))<0) goto nomem;
addattribute(l,tmp,val);
m=0;
if (ldif_parse_callback) {
switch (ldif_parse_callback(*l)) {
case -1:
return -1;
case 0:
m=*l;
break;
#if 0
case 1:
m=0;
break;
#endif
}
}
if (!m) if (!(m=malloc(sizeof(struct ldaprec)))) return 2;
(*l)->next=m;
m->n=0; m->dn=-1; m->next=0;
ofs=0;
// dumprec(*l);
l=&((*l)->next);
if (*l!=m) l=&((*l)->next);
++ldifrecords;
continue;
} else {
@@ -194,7 +223,8 @@ lookagain:
addattribute(l,tmp,val);
#endif
} while (!eof);
if ((*l)->dn<0) {
if (ldif_parse_callback && ldif_parse_callback(*l)==-1) return -1;
if ((*l)->dn<0 && ((*l)->next)) {
struct ldaprec* m=(*l)->next;
free((*l));
(*l)=m;

View File

@@ -9,8 +9,10 @@
/* Table of distinct strings: every string is stored once and identified
 * by its offset inside a string storage. */
typedef struct mduptable {
/* table: array of offsets (one unsigned long per distinct string),
 * scanned and appended to by mduptab_add.
 * strings: string storage embedded in the table itself; it is only
 * initialised by mduptab_init, not by mduptab_init_reuse. */
mstorage_t table,strings;
/* The storage mduptab_add actually compares against and appends to.
 * mduptab_init points it at the embedded `strings' member,
 * mduptab_init_reuse points it at a storage supplied by the caller. */
mstorage_t* Strings;
} mduptab_t;
void mduptab_init(mduptab_t* t);
void mduptab_init_reuse(mduptab_t* t,mstorage_t* s);
long mduptab_add(mduptab_t* t,const char* s,unsigned int len);
long mduptab_adds(mduptab_t* t,const char* s);

View File

@@ -10,13 +10,13 @@ long mduptab_add(mduptab_t* t,const char* s,unsigned int len) {
unsigned int i;
unsigned long* l=(unsigned long*)t->table.root;
long x,bak;
for (i=0; i<t->strings.used/sizeof(unsigned long); ++i)
if (bstr_equal2(t->strings.root+l[i],s,len))
for (i=0; i<t->table.used/sizeof(unsigned long); ++i)
if (bstr_equal2(t->Strings->root+l[i],s,len))
return l[i];
bak=t->strings.used;
if ((x=mstorage_add_bin(&t->strings,s,len))<0) return -1;
bak=t->Strings->used;
if ((x=mstorage_add_bin(t->Strings,s,len))<0) return -1;
if (mstorage_add(&t->table,(const char*)&x,sizeof(x))<0) {
t->strings.used=bak;
t->Strings->used=bak;
return -1;
}
return x;

View File

@@ -3,4 +3,5 @@
/* Set up a duplicate table that keeps its strings in its own embedded
 * storage: both the offset table and the embedded string storage are
 * initialised, and Strings is pointed at the embedded storage. */
void mduptab_init(mduptab_t* t) {
  mstorage_t* own=&t->strings;

  mstorage_init(&t->table);
  mstorage_init(own);
  t->Strings=own;
}

6
mduptab_init_reuse.c Normal file
View File

@@ -0,0 +1,6 @@
#include "mduptab.h"
/* Set up a duplicate table whose strings live in an existing storage.
 *   t  table to initialise; only its offset table is initialised here,
 *      the embedded `strings' member is left untouched.
 *   s  storage that Strings will point to; it is not initialised here. */
void mduptab_init_reuse(mduptab_t* t,mstorage_t* s) {
  t->Strings=s;
  mstorage_init(&t->table);
}

View File

@@ -15,7 +15,7 @@ int mstorage_init_persistent(mstorage_t* p,int fd);
/* Works like strstorage_add, but will return an
* offset to mstorage_root, which is mmapped and may thus change. */
/* negative offset == error */
/* offset -1 ==> error */
long mstorage_add(mstorage_t* p,const char* s,unsigned long n);
/* undo mapping */

195
parse.c
View File

@@ -1,3 +1,5 @@
/* This is just the main() for "parse". The actual parser is in
* ldif_parse.c */
#include <alloca.h>
#include <inttypes.h>
#include <unistd.h>
@@ -11,16 +13,44 @@
#include "uint32.h"
#include "byte.h"
/* these are defined in ldif_parse.c.
* We extern them here so we can initialize them.
* This was not necessary until I reworked mstorage_t to support
* persistence via a file descriptor, which needs to be -1 and not 0 if
* unused. */
extern mduptab_t attributes,classes;
/* we do a minor optimization by saving the strings of names of
* attributes and objectClass values only once. mduptab_t is the data
* structure used for this, see mduptab.h */
extern mstorage_t stringtable;
/* this is a giant string table where all the strings (keys and
* values) of the data are written to. This is actually the memory
* mapped destination file. */
extern int (*ldif_parse_callback)(struct ldaprec* l);
/* ldif_parse.c contains the actual ldif parser. It reads from a
* buffer (see libowfat, buffer.h) and creates a linked list of
* entries. This is unnecessarily wasteful, so I added the above
* callback, which is called after each record. If the callback
* is non-NULL and returns 0 when called with the record the parser
* just read in, the parser will assume the record has been stored
* somewhere else and not create a linked list but overwrite the same
* record in memory. This saves space and overhead. If we need to
* work on even larger files, this could even be reworked to be a
* persistent mmapped temp file. */
/* parse exp.ldif and write binary representation to "data".
* please read "FORMAT" for a description of the file format */
/* please note that tinyldap separates the data and the index although
* they are in the same file. This program only creates the binary
* representation, the actual indices are created by addindex. */
#ifndef PAGE_SIZE
#define PAGE_SIZE 4096
#endif
/* for debugging and error messages */
/* ldaprec is the struct used by ldif_parse.c */
void dumprec(struct ldaprec* l) {
int i;
if (l->dn>=0) {
@@ -30,10 +60,10 @@ void dumprec(struct ldaprec* l) {
} else
buffer_puts(buffer_1,"no dn?!\n");
for (i=0; i<l->n; ++i) {
buffer_puts(buffer_1,attributes.strings.root+l->a[i].name);
buffer_puts(buffer_1,attributes.Strings->root+l->a[i].name);
buffer_puts(buffer_1,": ");
if (l->a[i].name==objectClass)
buffer_puts(buffer_1,classes.strings.root+l->a[i].value);
buffer_puts(buffer_1,classes.Strings->root+l->a[i].value);
else
buffer_puts(buffer_1,stringtable.root+l->a[i].value);
buffer_puts(buffer_1,"\n");
@@ -41,17 +71,70 @@ void dumprec(struct ldaprec* l) {
buffer_putsflush(buffer_1,"\n");
}
extern mstorage_t stringtable;
extern mduptab_t attributes,classes;
/* Records are stored with a variable length externally, see FORMAT.
* We need to store the records and a table of the offsets of the
* records inside the data file in the data file. These data structures
* hold this data: */
mstorage_t record_offsets;
mstorage_t records;
unsigned long offset_classes,record_count;
/* record_count is just a convenience, the same value is also visible
* as record_offsets.used/4 */
/* Callback for ldif_parse: serialise one parsed record into `records'
 * and remember where it starts in `record_offsets'.
 *
 * Layout appended to `records', each entry a pair of packed uint32:
 *   <n+1, 0>                  header, n being the attribute count
 *   <dn, first objectClass>
 *   <name, value>             for every other attribute
 * The offset stored in `record_offsets' is the value returned by
 * mstorage_add for the header.
 *
 * Returns 0 when the record was stored and also when it was skipped
 * (no attributes, or no objectClass); returns -1 if mstorage_add fails.
 *
 * NOTE(review): l->dn is packed without being checked, so a record that
 * has attributes but no dn would be written with dn == -1 -- confirm
 * whether that can reach this callback. */
int ldif_callback(struct ldaprec* l) {
  char pair[8];   /* scratch buffer for one packed <uint32,uint32> pair */
  uint32 start;   /* offset of this record's header inside `records' */
  uint32 oc;      /* value of the first objectClass */
  int at;         /* index of the first objectClass attribute, -1 if none */
  int k;

  if (l->n==0) return 0;

  /* locate the first objectClass attribute */
  at=-1;
  for (k=0; k<l->n; ++k)
    if (l->a[k].name==objectClass) { at=k; break; }

  if (at<0) {
    buffer_putsflush(buffer_1,"ignoring record without objectClass...\n");
    dumprec(l);
    return 0;
  }

  /* the first objectClass goes next to the dn; setting its value to -1
   * lets the attribute loop below leave it out */
  oc=l->a[at].value;
  l->a[at].value=-1;

  /* header pair */
  uint32_pack(pair,l->n+1);
  uint32_pack(pair+4,0);
  start=mstorage_add(&records,pair,8);
  if (start==(uint32)-1) return -1;

  /* dn and first objectClass */
  uint32_pack(pair,l->dn);
  uint32_pack(pair+4,oc);
  if (mstorage_add(&records,pair,8)==-1) return -1;

  /* all remaining attributes */
  for (k=0; k<l->n; ++k) {
    if (l->a[k].name!=objectClass || l->a[k].value!=-1) {
      uint32_pack(pair,l->a[k].name);
      uint32_pack(pair+4,l->a[k].value);
      if (mstorage_add(&records,pair,8)==-1) return -1;
    }
  }

  /* remember where this record started */
  uint32_pack(pair,start);
  if (mstorage_add(&record_offsets,pair,4)==-1) return -1;

  ++record_count;
  return 0;
}
int main(int argc,char* argv[]) {
int fd;
long len;
char* destname=argc<3?"data":argv[2];
unsigned long size_of_string_table,indices_offset,record_count;
long offset_stringtable,offset_classes,offset_attributes;
unsigned long size_of_string_table,indices_offset;
long offset_stringtable;
char* map,* dest;
mstorage_init(&record_offsets);
mstorage_init(&records);
ldif_parse_callback=ldif_callback;
if ((fd=open(destname,O_RDWR|O_CREAT|O_TRUNC,0600))<0) {
buffer_puts(buffer_2,"could not create destination data file ");
derrout:
@@ -65,8 +148,8 @@ derrout:
buffer_puts(buffer_2,"mstorage_init_persistent: error mmapping ");
goto derrout;
}
mduptab_init(&attributes);
mduptab_init(&classes);
mduptab_init_reuse(&attributes,&stringtable);
mduptab_init_reuse(&classes,&stringtable);
{
char dummy[5*4];
@@ -79,7 +162,7 @@ derrout:
return 1;
}
size_of_string_table=stringtable.used+classes.strings.used+attributes.strings.used-5*4;
size_of_string_table=stringtable.used-5*4;
size_of_string_table=(size_of_string_table+3)&-4; /* round up to 32 bits */
/* first find out how much space we need */
len = 5*sizeof(uint32_t); /* magic plus four counts */
@@ -88,37 +171,8 @@ derrout:
// fdprintf(2,"offsets of records: %lu\n",len);
/* now for the hard part: the records */
{
struct ldaprec* x=first;
record_count=0;
while (x) {
int oc=0,i;
// long old=len;
/* we add 8 for the <length-in-uint32_t,0> pair and we substract 8
* for the two saved pointers ("dn" and "objectClass") */
if (x->dn>=0) len+=8; else {
if (x->n==0 && x->next==0) break;
buffer_putsflush(buffer_2,"record without dn?!\n");
dumprec(x);
return 1;
}
for (i=0; i<x->n; ++i) {
len+=8;
if (x->a[i].name==objectClass) oc=1;
}
if (!oc) {
buffer_puts(buffer_2,"record \"");
buffer_puts(buffer_2,x->dn+stringtable.root);
buffer_putsflush(buffer_2,"\" has no objectClass?!\n");
dumprec(x);
return 1;
}
++record_count;
// fdprintf(2,"considering record \"%s\": length %d\n",x->dn+stringtable.root,len-old);
x=x->next;
}
}
len += records.used;
// fdprintf(2,"offsets of indices: %lu\n",len);
indices_offset=len;
len+=record_count*4;
@@ -139,66 +193,31 @@ derrout:
// size_of_string_table=stringtable.used+classes.strings.used+attributes.strings.used;
offset_stringtable=5*4;
offset_classes= /* offset_stringtable+ */ stringtable.used;
offset_attributes=offset_classes+classes.strings.used;
// byte_copy(map+offset_stringtable,stringtable.used,stringtable.root);
byte_copy(map+offset_classes,classes.strings.used,classes.strings.root);
byte_copy(map+offset_attributes,attributes.strings.used,attributes.strings.root);
// fdprintf(2,"offset_classes=%lu, offset_attributes=%lu, attributes=%lu\n",
// offset_classes,offset_attributes,attributes.strings.used);
offset_classes=stringtable.used;
dest=map+offset_stringtable+size_of_string_table;
{
unsigned long i;
for (i=0; i<attributes.table.used/sizeof(long); ++i) {
#if 0
fdprintf(2,"writing at %x: attribute %lu (%s)\n",dest+i-map,
((long*)attributes.table.root)[i],attributes.strings.root+((long*)attributes.table.root)[i]);
#endif
uint32_pack(dest+i*4,((long*)attributes.table.root)[i]+offset_attributes);
uint32_pack(dest+i*4,((long*)attributes.table.root)[i]);
}
i=attributes.table.used/sizeof(long)*4;
dest+=i;
byte_zero(dest,i);
dest+=i;
}
// fdprintf(2,"actual offset before records: %lu\n",dest-map);
/* now the records */
{
struct ldaprec* x=first;
uint32_t* record_offsets=alloca(4*record_count);
uint32_t cur=0;
while (x) {
int i=x->n+1;
record_offsets[cur]=dest-map; ++cur;
uint32_pack(dest,i); uint32_pack(dest+4,0); dest+=8;
uint32_pack(dest,x->dn /* +offset_stringtable */);
for (i=0; i<x->n; ++i) {
if (x->a[i].name==objectClass) {
uint32_pack(dest+4,x->a[i].value+offset_classes);
x->a[i].name=-1;
break;
}
}
dest+=8;
for (i=0; i<x->n; ++i) {
if (x->a[i].name>=0) {
uint32_pack(dest,x->a[i].name+offset_attributes);
if (x->a[i].name==objectClass)
uint32_pack(dest+4,x->a[i].value+offset_classes);
else
uint32_pack(dest+4,x->a[i].value /* +offset_stringtable */);
dest+=8;
}
}
x=x->next;
}
// fdprintf(2,"actual offset of record_index: %lu\n",dest-map);
/* now the record_index */
for (cur=0; cur<record_count; ++cur) {
uint32_pack(dest,record_offsets[cur]);
dest+=4;
}
char* x;
unsigned long i;
uint32 addme=dest-map;
byte_copy(dest,records.used,records.root);
x=record_offsets.root;
dest+=records.used;
for (i=0; i<record_count; ++i)
uint32_pack(dest+4*i,uint32_read(x+4*i)+addme);
}
munmap(map,len);
close(fd);
return 0;