archie/prospero/lib/psrv/archie2/prarch_match.c
2024-05-27 16:13:40 +02:00

638 lines
19 KiB
C

/* XXX: To make this thread safe, calls to re_comp() and re_exec() must be protected by a mutex. */
#include <stdio.h>
#include <sys/types.h>
#include <string.h>
#include <ctype.h>
#include <stdlib.h> /* For malloc and free */
#define _toupper(c) ((c)-'a'+'A')
#ifdef MMAP
#include <sys/mman.h>
#endif
/* Archie definitions */
#include <ndbm.h>
#include <defines.h>
#include <structs.h>
#include <database.h>
#include <error.h>
#include "prarch.h"
#include <ardp.h>
#include <pfs.h>
#include <perrno.h>
#include <plog.h>
/* Helpers defined outside this section; declared old-style (no prototypes). */
VLINK atoplink(); /* Called below as atoplink(site_out record, flags) to build a result link */
VLINK atoqlink(); /* Called below as atoqlink(string, max_hits, max_match, max_hitspm) for "only strings" results */
char *re_comp(); /* Compiles a regex; a non-NULL return is treated below as a bad regex */
char *make_lcase(); /* NOTE(review): not referenced in this section - presumably a lower-casing helper; confirm */
int get_match_list(); /* Appends site_out entries for a file index; returns A_OK or HITS_EXCEEDED on success */
/* The strings table searched by prarch_match: start address and size */
extern char *strings_begin;
extern long strings_table_size;
/* dbm database queried with dbm_fetch for exact string lookups */
extern DBM *fast_strings;
/* So we can adjust our cache policy based on queue length */
extern int pQlen;
/*
 * lowertable - byte to lower-case translation table
 *
 * Indexed by an (unsigned) byte value.  The 26 ASCII upper-case letters
 * map to their lower-case counterparts; every other byte maps to itself.
 * Each row below holds 16 consecutive entries; the comment in front of a
 * row gives the index of its first entry.
 */
static char lowertable[256] = {
    /* 0x00 */ '\x00','\x01','\x02','\x03','\x04','\x05','\x06','\x07',
               '\x08','\x09','\x0a','\x0b','\x0c','\x0d','\x0e','\x0f',
    /* 0x10 */ '\x10','\x11','\x12','\x13','\x14','\x15','\x16','\x17',
               '\x18','\x19','\x1a','\x1b','\x1c','\x1d','\x1e','\x1f',
    /* 0x20 */ ' ', '!', '"', '#', '$', '%', '&', '\'',
               '(', ')', '*', '+', ',', '-', '.', '/',
    /* 0x30 */ '0', '1', '2', '3', '4', '5', '6', '7',
               '8', '9', ':', ';', '<', '=', '>', '?',
    /* 0x40 */ '@', 'a', 'b', 'c', 'd', 'e', 'f', 'g',     /* 'A'..'O' fold */
               'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    /* 0x50 */ 'p', 'q', 'r', 's', 't', 'u', 'v', 'w',     /* 'P'..'Z' fold */
               'x', 'y', 'z', '[', '\\', ']', '^', '_',
    /* 0x60 */ '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g',
               'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    /* 0x70 */ 'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
               'x', 'y', 'z', '{', '|', '}', '~', '\x7f',
    /* 0x80 */ '\x80','\x81','\x82','\x83','\x84','\x85','\x86','\x87',
               '\x88','\x89','\x8a','\x8b','\x8c','\x8d','\x8e','\x8f',
    /* 0x90 */ '\x90','\x91','\x92','\x93','\x94','\x95','\x96','\x97',
               '\x98','\x99','\x9a','\x9b','\x9c','\x9d','\x9e','\x9f',
    /* 0xa0 */ '\xa0','\xa1','\xa2','\xa3','\xa4','\xa5','\xa6','\xa7',
               '\xa8','\xa9','\xaa','\xab','\xac','\xad','\xae','\xaf',
    /* 0xb0 */ '\xb0','\xb1','\xb2','\xb3','\xb4','\xb5','\xb6','\xb7',
               '\xb8','\xb9','\xba','\xbb','\xbc','\xbd','\xbe','\xbf',
    /* 0xc0 */ '\xc0','\xc1','\xc2','\xc3','\xc4','\xc5','\xc6','\xc7',
               '\xc8','\xc9','\xca','\xcb','\xcc','\xcd','\xce','\xcf',
    /* 0xd0 */ '\xd0','\xd1','\xd2','\xd3','\xd4','\xd5','\xd6','\xd7',
               '\xd8','\xd9','\xda','\xdb','\xdc','\xdd','\xde','\xdf',
    /* 0xe0 */ '\xe0','\xe1','\xe2','\xe3','\xe4','\xe5','\xe6','\xe7',
               '\xe8','\xe9','\xea','\xeb','\xec','\xed','\xee','\xef',
    /* 0xf0 */ '\xf0','\xf1','\xf2','\xf3','\xf4','\xf5','\xf6','\xf7',
               '\xf8','\xf9','\xfa','\xfb','\xfc','\xfd','\xfe','\xff'
};
/* Maximum number of entries allocated for the result cache (see add_to_cache) */
#define MATCH_CACHE_SIZE 15
/*
 * One cached query result.  Entries are kept on a singly linked list
 * headed by mcache; check_cache moves an entry to the front when it is
 * hit and add_to_cache inserts (or recycles the last entry) at the front.
 */
struct match_cache {
char *arg; /* Search string the results were produced for */
int max_hits; /* Maximum matches; <0 = found all (stored negated by prarch_match) */
int offset; /* Offset */
search_sel search_type; /* Search method (the one used) */
search_sel req_type; /* Requested method */
VLINK matches; /* Matches */
VLINK more; /* Additional matches: links cut off matches by check_cache for a smaller request */
int flags; /* Flags: for link attributes */
struct match_cache *next; /* Next entry in cache */
};
/* Head of the cache list (NULL when nothing has been cached yet) */
static struct match_cache *mcache = NULL;
/* Number of cache entries allocated so far; never exceeds MATCH_CACHE_SIZE */
static int cachecount = 0;
/*
 * prarch_match - Search archie database for specified file
 *
 *   PRARCH_MATCH searches the archie strings database for entries
 *   matching program_name and inserts the resulting links into vd.
 *
 *   ARGS:  program_name - string or regular expression to match
 *          max_hits     - maximum number of entries to return
 *          max_match    - maximum number of unique strings
 *          max_hitspm   - maximum hits per matching string
 *          offset       - skip this many hits before returning any
 *          search_type  - search method (see below)
 *          vd           - directory to be filled in; must have no links
 *          flags        - link attribute flags; handed to atoplink and
 *                         used as part of the cache key
 *          onlystrings  - flag - only return the matching strings
 *                         (built with atoqlink); these are not cached
 *
 *   Search method is one of:  S_FULL_REGEX
 *                             S_EXACT
 *                             S_SUB_NCASE_STR
 *                             S_SUB_CASE_STR
 *   or one of S_E_FULL_REGEX, S_E_SUB_NCASE_STR, S_E_SUB_CASE_STR, which
 *   try the exact (dbm) lookup first and fall back to the named method
 *   when the string is not found.
 *
 *   RETURNS: PRARCH_SUCCESS (PSUCCESS when answered from the cache),
 *            PRARCH_BAD_ARG for an empty or over-long pattern, a non
 *            empty directory or an unknown search method,
 *            PRARCH_OUT_OF_MEMORY, PRARCH_BAD_REGEX or PRARCH_DB_ERROR.
 */
int prarch_match(char        *program_name, /* Regular expression to be matched  */
                 int         max_hits,      /* Maximum number of entries to rtrn */
                 int         max_match,     /* Maximum number of unique strings  */
                 int         max_hitspm,    /* Maximum hits per match            */
                 int         offset,        /* Skip # matches before starting    */
                 search_sel  search_type,   /* Search method                     */
                 VDIR        vd,            /* Directory to be filled in         */
                 int         flags,         /* Flag for link attributes          */
                 int         onlystrings)   /* Only return matching strings      */
{
    /*
     * Search the database for the string specified by 'program_name'.  Use
     * the fast dbm strings database for the exact methods, otherwise scan
     * the strings table.  Stop searching after all matches have been found,
     * or the hit limits have been reached, whichever comes first.
     */
    char            s_string[MAX_STRING_LEN];
    char            *strings_ptr;
    char            *strings_curr_off;
    strings_header  str_head;
    datum           search_key, key_value;
    search_sel      new_search_type = S_EXACT;    /* Alternate search method */
    search_sel      or_search_type = search_type; /* Original search method  */
    int             nocase = 0;
    int             hits_exceeded = FALSE;        /* should be boolean? */
    char            *strings_end;
    int             match_number;
    site_out        **site_outptr;
    site_out        site_outrec;
    int             i;
    VLINK           cur_link;
    int             loopcount = 0;
    int             retval;
    int             match_rem = max_match;

    if(!program_name || !(*program_name)) return(PRARCH_BAD_ARG);

    /* s_string (and the pattern copy made from it) are fixed size    */
    /* buffers, so refuse anything that would not fit with its NUL    */
    if(strlen(program_name) >= (size_t) MAX_STRING_LEN)
        return(PRARCH_BAD_ARG);

    if((0 < max_hits) && (max_hits < match_rem)) match_rem = max_hits;
    if((0 < max_hits) && (max_hits < max_hitspm)) max_hitspm = max_hits;

    strcpy(s_string, program_name);

    /* See if we can use a less expensive search method */
    if((search_type == S_FULL_REGEX) || (search_type == S_E_FULL_REGEX)) {
        /* Regex search assumes wildcards on both ends, so remove from string */
        if(strncmp(program_name,".*",2) == 0)
            strcpy(s_string, program_name+2);
        if((i = strlen(s_string)) >= 2) {
            if(strcmp(s_string+i-2,".*") == 0)
                *(s_string+i-2) = '\0';
        }
        /* If no special characters, then fall back to substring search */
        if((search_type == S_FULL_REGEX) &&
           (strpbrk(s_string,"\\^$.,[]<>*+?|(){}/") == NULL))
            or_search_type = search_type = S_SUB_CASE_STR;
        else if((search_type == S_E_FULL_REGEX) &&
                (strpbrk(s_string,"\\^$.,[]<>*+?|(){}/") == NULL))
            or_search_type = search_type = S_E_SUB_CASE_STR;
    }

    /* The caching code assumes we are handed an empty directory */
    /* if not, return an error for now.  Eventually we will get  */
    /* rid of that assumption                                    */
    if(vd->links) {
        plog(L_DIR_ERR, NOREQ, "Prarch_match handed non empty dir",0);
        return(PRARCH_BAD_ARG);
    }

    if(!onlystrings && (check_cache(s_string,max_hits,offset,search_type,
                                    flags,&(vd->links)) == TRUE)) {
        plog(L_DB_INFO, NOREQ, "Responding with cached data",0);
        return(PSUCCESS);
    }

    /* Array of pointers filled in by get_match_list.  From here on   */
    /* every return path has to release it.                           */
    site_outptr = (site_out **) malloc((unsigned)(sizeof(site_out) *
                                                  (max_hits + offset)));
    if(!site_outptr) return(PRARCH_OUT_OF_MEMORY);

 startsearch:

    strings_ptr = strings_begin;
    strings_end = strings_begin + (int) strings_table_size;
    match_number = 0;

    switch(search_type){

    /* The S_E_ methods record their fallback and then share the      */
    /* exact lookup code with S_EXACT                                 */
    case S_E_SUB_CASE_STR:
        new_search_type = S_SUB_CASE_STR;
        goto exact_match;
    case S_E_SUB_NCASE_STR:
        new_search_type = S_SUB_NCASE_STR;
        goto exact_match;
    case S_E_FULL_REGEX:
        new_search_type = S_FULL_REGEX;
    exact_match:
    case S_EXACT:
        search_key.dptr = s_string;
        search_key.dsize = strlen(s_string) + 1;
        ardp_accept();
        key_value = dbm_fetch(fast_strings, search_key) ;
        if(key_value.dptr != (char *)NULL){ /* string in table */
            int string_pos;

            /* NOTE(review): trusts the stored value to be no larger  */
            /* than an int - confirm against the database writer      */
            bcopy(key_value.dptr,(char *)&string_pos, key_value.dsize);
            strings_ptr += string_pos;
            bcopy(strings_ptr,(char *)&str_head,sizeof(strings_header));
            ardp_accept();
            if(onlystrings) {
                cur_link = atoqlink(strings_ptr,max_hits,max_match,max_hitspm);
                if(cur_link) vl_insert(cur_link,vd,VLI_NOSORT);
                if(--match_rem <= 0) {
                    hits_exceeded = TRUE;
                    break;
                }
            }
            else if(str_head.filet_index != -1) {
                retval = get_match_list((int) str_head.filet_index, max_hitspm,
                                        &match_number, site_outptr, FALSE);
                if((retval != A_OK) && (retval != HITS_EXCEEDED)) {
                    plog(L_DB_ERROR, NOREQ,"get_match_list failed (%d)",retval,0);
                    goto cleanup;
                }
                if( match_number >= max_hits + offset ){
                    hits_exceeded = TRUE;
                    break;
                }
            }
        }
        else if (search_type != S_EXACT) { /* Not found - but try other method */
            search_type = new_search_type;
            goto startsearch;
        }
        break;

    case S_FULL_REGEX:
        if(re_comp(s_string) != (char *)NULL){
            /* Nothing has been stored in site_outptr yet             */
            /* (match_number is 0), so only the array itself is freed */
            free((char *)site_outptr);
            return (PRARCH_BAD_REGEX);
        }
        str_head.str_len = -1;
        ardp_accept();
        while((strings_curr_off = strings_ptr + str_head.str_len + 1) < strings_end){
            if((loopcount++ & 0x7ff) == 0) ardp_accept();
            strings_ptr = strings_curr_off;
            bcopy(strings_ptr,(char *)&str_head,sizeof(strings_header));
            strings_ptr += sizeof(strings_header);
            if(re_exec( strings_ptr ) == 1 ){ /* TRUE */
                strings_curr_off = strings_ptr;
                ardp_accept();
                if(onlystrings) {
                    if(strstr(strings_ptr," -> ") == NULL) { /* No broken strings */
                        cur_link = atoqlink(strings_ptr,max_hits,max_match,max_hitspm);
                        if(cur_link) vl_insert(cur_link,vd,VLI_NOSORT);
                        if(--match_rem <= 0) {
                            hits_exceeded = TRUE;
                            break;
                        }
                    }
                }
                else if(str_head.filet_index != -1){
                    retval = get_match_list((int) str_head.filet_index, max_hitspm,
                                            &match_number, site_outptr, FALSE);
                    if((retval != A_OK) && (retval != HITS_EXCEEDED)) {
                        plog(L_DB_ERROR, NOREQ,"get_match_list failed (%d)",retval,0);
                        goto cleanup;
                    }
                    if( match_number >= max_hits + offset ){
                        hits_exceeded = TRUE;
                        break;
                    }
                }
            }
        }
        break;

#define TABLESIZE 256

    case S_SUB_NCASE_STR:
        nocase++;
        /* fall through */
    case S_SUB_CASE_STR: {
        char            pattern[MAX_STRING_LEN];
        int             skiptab[TABLESIZE];
        register int    pc, tc;
        register int    local_loopcount = 0xfff;
        char            *bp1;
        int             skip;
        int             plen;
        int             plen_1;
        int             tlen;
        unsigned char   tchar;

        plen = strlen(s_string);
        plen_1 = plen -1;

        /* Old code (replaced by inline code taken from initskip) */
        /* patlen = strlen(s_string ) ; */
        /* initskip(s_string, patlen, search_type == S_SUB_NCASE_STR) ; */

        if(nocase) {
            /* Index through unsigned char: plain char may be signed  */
            /* and bytes >= 0x80 would index before the table         */
            for(pc = 0; s_string[pc]; pc++)
                pattern[pc] = lowertable[(unsigned char) s_string[pc]];
            pattern[pc] = '\0';
        }
        else strcpy(pattern,s_string);

        for( i = 0 ; i < TABLESIZE ; i++ )
            skiptab[ i ] = plen;

        /* Note that we want both ucase and lcase in this table if nocase */
        for( i = 0, tchar = *pattern; i < plen ; i++, tchar = *(pattern + i)) {
            skiptab[tchar] = plen - 1 - i;
            if(nocase && islower(tchar))
                skiptab[_toupper(tchar)] = plen - 1 - i;
        }

        /* Begin heavily optimized and non portable code */

        /* Note that we are depending on str_head being 8 bytes */
        tlen = -9; /* str_head.str_len */
        strings_curr_off = strings_ptr;

        while((strings_curr_off += tlen + 9) < strings_end) {
            if(--local_loopcount == 0) {
                ardp_accept();
                local_loopcount = 0xfff;
            }

            strings_ptr = strings_curr_off;
            /* This is a kludge, non-portable, but it eliminates a pr call */
            /* Note that the size is 8 on suns. Is there a better way?     */
            /* bcopy(strings_ptr,(char *)&str_head,sizeof(strings_header)); */
            bp1 = (char *) &str_head;
            /* The copying of the file index is done only on a match */
            bp1[4] = strings_ptr[4]; bp1[5] = strings_ptr[5];
            /* bp1[6] = strings_ptr[6]; bp1[7] = strings_ptr[7]; */

            tlen = (unsigned short) str_head.str_len;

            /* To catch database corruption, this is a sanity check */
            if((tlen < 0) || (tlen > MAX_STRING_LEN)) {
                plog(L_DB_ERROR, NOREQ,"Database corrupt: string length out of bounds",0);
                break;
            }

            /* Old code (replaced by inline code taken from strfind) */
            /* if(strfind(strings_ptr,str_head.str_len)) */

            if( tlen <= plen_1 ) continue;
            pc = tc = plen_1;

            strings_ptr += 8;

            /* Moved the nocase test outside the inner loop for performace */
            /* Clauses are identical except for the first if               */
            if(plen_1 < 0) {
                /* Empty pattern (e.g. ".*" with the wildcards removed):  */
                /* every string matches.  pc is already negative, so we   */
                /* skip the scan, which would otherwise read pattern[-1]  */
            }
            else if(nocase) do {
                tchar = strings_ptr[tc];
                /* improve efficiency of this test */
                if(lowertable[tchar] == pattern[pc]) {--pc; --tc;}
                else {
                    skip = skiptab[tchar] ;
                    tc += (skip < plen_1 - pc) ? plen : skip ;
                    pc = plen_1 ;
                }
            } while( pc >= 0 && tc < tlen ) ;
            else /* (!nocase) */ do {
                tchar = strings_ptr[tc];
                /* improve efficiency of this test */
                if(tchar == pattern[pc]) {--pc; --tc;}
                else {
                    skip = skiptab[tchar] ;
                    tc += (skip < plen_1 - pc) ? plen : skip ;
                    pc = plen_1 ;
                }
            } while( pc >= 0 && tc < tlen ) ;

            if(pc >= 0) continue;

            /* We have a match */

            /* Finish copying str_head - strings_curr_off */
            /* is old strings_ptr.                        */
            bp1[0] = strings_curr_off[0]; bp1[1] = strings_curr_off[1];
            bp1[2] = strings_curr_off[2]; bp1[3] = strings_curr_off[3];

            /* End heavily optimized and non portable code */

            ardp_accept();
            if(onlystrings) {
                if(strstr(strings_ptr," -> ") == NULL) { /* No broken strings */
                    cur_link = atoqlink(strings_ptr,max_hits,max_match,max_hitspm);
                    if(cur_link) vl_insert(cur_link,vd,VLI_NOSORT);
                    if(--match_rem <= 0) {
                        hits_exceeded = TRUE;
                        break;
                    }
                }
            }
            else if(str_head.filet_index != -1){
                retval = get_match_list((int) str_head.filet_index, max_hitspm,
                                        &match_number, site_outptr, FALSE);
                if((retval != A_OK) && (retval != HITS_EXCEEDED)) {
                    plog(L_DB_ERROR,NOREQ,"get_match_list failed (%d)",retval,0);
                    goto cleanup;
                }
                if( match_number >= max_hits + offset ) {
                    hits_exceeded = TRUE;
                    break;
                }
            }
        }
    }
        break;

    default:
        /* Unknown search method: nothing was collected, but the      */
        /* result array still has to be released                      */
        free((char *)site_outptr);
        return(PRARCH_BAD_ARG);

    cleanup:
        /* Error exit used by the search loops above */
        for(i = 0;i < match_number; i++) free((char *)site_outptr[i]);
        free((char *)site_outptr);
        return(PRARCH_DB_ERROR);
    }

    /* Turn the collected records into links, skipping the first      */
    /* 'offset' of them, and release each record as we go             */
    for(i = 0;i < match_number; i++){
        if((i & 0x7f) == 0) ardp_accept();
        site_outrec = *site_outptr[i];
        if(i >= offset) {
            cur_link = atoplink(site_outrec,flags);
            if(cur_link) vl_insert(cur_link,vd,VLI_NOSORT);
        }
        free((char *)site_outptr[i]);
    }
    free((char *)site_outptr);

    if(hits_exceeded) {
        /* Insert a continuation entry */
    }

    /* Exact lookups are not cached when the request queue is long */
    if((search_type == S_EXACT) && (pQlen > (MATCH_CACHE_SIZE - 5)))
        return(PRARCH_SUCCESS);

    /* A negative max_hits in the cache means "all matches were found" */
    if(!onlystrings)
        add_to_cache(vd->links,s_string, (hits_exceeded ? max_hits : -max_hits),
                     offset,search_type,or_search_type,flags);
    return(PRARCH_SUCCESS);
}
/* Check for cached results */
check_cache(arg,max_hits,offset,qtype,flags,linkpp)
char *arg;
int max_hits;
int offset;
search_sel qtype;
int flags;
VLINK *linkpp;
{
struct match_cache *cachep = mcache;
struct match_cache *pcachep = NULL;
VLINK tmp_link, cur_link;
VLINK rest = NULL;
VLINK next = NULL;
int count = max_hits;
while(cachep) {
if(((qtype == cachep->search_type)||(qtype == cachep->req_type))&&
(cachep->offset == offset) &&
/* All results are in cache - or enough to satisfy request */
((cachep->max_hits < 0) || (max_hits <= cachep->max_hits)) &&
(strcmp(cachep->arg,arg) == 0) &&
(cachep->flags == flags)) {
/* We have a match. Move to front of list */
if(pcachep) {
pcachep->next = cachep->next;
cachep->next = mcache;
mcache = cachep;
}
/* We now have to clear the expanded bits or the links */
/* returned in previous queries will not be returned */
/* We also need to truncate the list of there are more */
/* matches than requested */
cur_link = cachep->matches;
/* IMPORTANT: This code assumes the list is one */
/* dimensional, which is the case because we called */
/* vl_insert with the VLI_NOSORT option */
while(cur_link) {
cur_link->expanded = FALSE;
if((--count == 0) && cur_link->next) {
/* truncate list */
if(cachep->more) {
cur_link->next->previous = cachep->more->previous;
cachep->more->previous = cachep->matches->previous;
cachep->matches->previous->next = cachep->more;
}
else {
cachep->more = cur_link->next;
cachep->more->previous = cachep->matches->previous;
}
cur_link->next = NULL;
cachep->matches->previous = cur_link;
}
else if ((cur_link->next == NULL) && (count != 0) &&
cachep->more) {
/* Merge lists */
cachep->matches->previous = cachep->more->previous;
cur_link->next = cachep->more;
cachep->more->previous = cur_link;
cachep->more = NULL;
}
cur_link = cur_link->next;
}
*linkpp = cachep->matches;
return(TRUE);
}
pcachep = cachep;
cachep = cachep->next;
}
*linkpp = NULL;
return(FALSE);
}
/* Cache the response for later use */
add_to_cache(vl,arg,max_hits,offset,search_type,req_type,flags)
VLINK vl;
char *arg;
int max_hits;
int offset;
search_sel search_type;
search_sel req_type;
int flags;
{
struct match_cache *newresults = NULL;
struct match_cache *pcachep = NULL;
if(cachecount < MATCH_CACHE_SIZE) { /* Create a new entry */
newresults = (struct match_cache *) malloc(sizeof(struct match_cache));
cachecount++;
newresults->next = mcache;
mcache = newresults;
newresults->arg = stcopy(arg);
newresults->max_hits = max_hits;
newresults->offset = offset;
newresults->search_type = search_type;
newresults->req_type = req_type;
newresults->flags = flags;
newresults->matches = NULL;
newresults->more = NULL;
}
else { /* Use last entry - Assumes list has at least two entries */
pcachep = mcache;
while(pcachep->next) pcachep = pcachep->next;
newresults = pcachep;
/* move to front of list */
newresults->next = mcache;
mcache = newresults;
/* Fix the last entry so we don't have a cycle */
while(pcachep->next != newresults) pcachep = pcachep->next;
pcachep->next = NULL;
/* Free the old results */
if(newresults->matches) {
newresults->matches->dontfree = FALSE;
vllfree(newresults->matches);
newresults->matches = NULL;
}
if(newresults->more) {
newresults->more->dontfree = FALSE;
vllfree(newresults->more);
newresults->more = NULL;
}
newresults->arg = stcopyr(arg,newresults->arg);
newresults->max_hits = max_hits;
newresults->offset = offset;
newresults->search_type = search_type;
newresults->req_type = req_type;
newresults->flags = flags;
}
/* Since we are caching the data. If there are any links, */
/* note that they should not be freed when sent back */
if(vl) vl->dontfree = TRUE;
newresults->matches = vl;
}