/*XXX Note to make this thread safe, need to mutex re_comp and re_exec */ #include #include #include #include #include /* For malloc and free */ #define _toupper(c) ((c)-'a'+'A') #ifdef MMAP #include #endif /* Archie definitions */ #include #include #include #include #include #include "prarch.h" #include #include #include #include VLINK atoplink(); VLINK atoqlink(); char *re_comp(); char *make_lcase(); int get_match_list(); extern char *strings_begin; extern long strings_table_size; extern DBM *fast_strings; /* So we can adjust our cache policy based on queue length */ extern int pQlen; static char lowertable[256] = { '\000','\001','\002','\003','\004','\005','\006','\007', '\010','\011','\012','\013','\014','\015','\016','\017', '\020','\021','\022','\023','\024','\025','\026','\027', '\030','\031','\032','\033','\034','\035','\036','\037', ' ','!','"','#','$','%','&','\'', '(',')','*','+',',','-','.','/', '0','1','2','3','4','5','6','7', '8','9',':',';','<','=','>','?', '@','a','b','c','d','e','f','g', 'h','i','j','k','l','m','n','o', 'p','q','r','s','t','u','v','w', 'x','y','z','[','\\',']','^','_', '`','a','b','c','d','e','f','g', 'h','i','j','k','l','m','n','o', 'p','q','r','s','t','u','v','w', 'x','y','z','{','|','}','~','\177', '\200','\201','\202','\203','\204','\205','\206','\207', '\210','\211','\212','\213','\214','\215','\216','\217', '\220','\221','\222','\223','\224','\225','\226','\227', '\230','\231','\232','\233','\234','\235','\236','\237', '\240','\241','\242','\243','\244','\245','\246','\247', '\250','\251','\252','\253','\254','\255','\256','\257', '\260','\261','\262','\263','\264','\265','\266','\267', '\270','\271','\272','\273','\274','\275','\276','\277', '\300','\301','\302','\303','\304','\305','\306','\307', '\310','\311','\312','\313','\314','\315','\316','\317', '\320','\321','\322','\323','\324','\325','\326','\327', '\330','\331','\332','\333','\334','\335','\336','\337', '\340','\341','\342','\343','\344','\345','\346','\347', '\350','\351','\352','\353','\354','\355','\356','\357', '\360','\361','\362','\363','\364','\365','\366','\367', '\370','\371','\372','\373','\374','\375','\376','\377'}; #define MATCH_CACHE_SIZE 15 struct match_cache { char *arg; /* Matched regular expression */ int max_hits; /* Maximum matchess <0 = found all */ int offset; /* Offset */ search_sel search_type; /* Search method (the one used) */ search_sel req_type; /* Requested method */ VLINK matches; /* Matches */ VLINK more; /* Additional matches */ int flags; /* Flags: for link attributes */ struct match_cache *next; /* Next entry in cache */ }; static struct match_cache *mcache = NULL; static int cachecount = 0; /* * prarch_match - Search archie database for specified file * * PRARCH_MATCH searches the archie database and returns * a list of files matching the provided regular expression * * ARGS: program_name - regular expression for files to match * max_hits - maximum number of entries to return (max hits) * offset - start the search after this many hits * search_type - search method * vd - pointer to directory to be filled in * archiedir - flag - directory links should be to archie * onlystrings - flag - only return strings, not matches * * Search method is one of: S_FULL_REGEX * S_EXACT * S_SUB_NCASE_STR * S_SUB_CASE_STR */ int prarch_match(char *program_name, /* Regular expression to be matched */ int max_hits, /* Maximum number of entries to rtrn */ int max_match, /* Maximum number of unique strings */ int max_hitspm, /* Maximum hits per match */ int offset, /* Skip # matches before starting */ search_sel search_type, /* Search method */ VDIR vd, /* Directory to be filled in */ int flags, /* Flag for link attributes */ int onlystrings) /* Only return matching strings */ { /* * Search the database for the string specified by 'program_name'. Use the * fast dbm strings database if 'is_exact' is set, otherwise search through * the strings table. Stop searching after all matches have been found, or * 'max_hits' matches have been found, whichever comes first. */ char s_string[MAX_STRING_LEN]; char *strings_ptr; char *strings_curr_off; strings_header str_head; datum search_key, key_value; search_sel new_search_type = S_EXACT; /* Alternate search method */ search_sel or_search_type = search_type; /* Original search method */ int nocase = 0; int hits_exceeded = FALSE; /* should be boolean? */ char *strings_end; int match_number; int patlen; site_out **site_outptr; site_out site_outrec; int i; VLINK cur_link; int loopcount = 0; int retval; int match_rem = max_match; if(!program_name || !(*program_name)) return(PRARCH_BAD_ARG); if((0 < max_hits) && (max_hits < match_rem)) match_rem = max_hits; if((0 < max_hits) && (max_hits < max_hitspm)) max_hitspm = max_hits; strcpy(s_string, program_name); /* See if we can use a less expensive search method */ if((search_type == S_FULL_REGEX) || (search_type == S_E_FULL_REGEX)) { /* Regex search assumes wildcards on both ends, so remove from string */ if(strncmp(program_name,".*",2) == 0) strcpy(s_string, program_name+2); if((i = strlen(s_string)) >= 2) { if(strcmp(s_string+i-2,".*") == 0) *(s_string+i-2) = '\0'; } /* If no special characters, then fall back to substring search */ if((search_type == S_FULL_REGEX) && (strpbrk(s_string,"\\^$.,[]<>*+?|(){}/") == NULL)) or_search_type = search_type = S_SUB_CASE_STR; else if((search_type == S_E_FULL_REGEX) && (strpbrk(s_string,"\\^$.,[]<>*+?|(){}/") == NULL)) or_search_type = search_type = S_E_SUB_CASE_STR; } /* The caching code assumes we are handed an empty directory */ /* if not, return an error for now. Eventually we will get */ /* rid of that assumption */ if(vd->links) { plog(L_DIR_ERR, NOREQ, "Prarch_match handed non empty dir",0); return(PRARCH_BAD_ARG); } if(!onlystrings && (check_cache(s_string,max_hits,offset,search_type, flags,&(vd->links)) == TRUE)) { plog(L_DB_INFO, NOREQ, "Responding with cached data",0); return(PSUCCESS); } site_outptr = (site_out **) malloc((unsigned)(sizeof(site_out) * (max_hits + offset))); if(!site_outptr) return(PRARCH_OUT_OF_MEMORY); startsearch: strings_ptr = strings_begin; strings_end = strings_begin + (int) strings_table_size; match_number = 0; switch(search_type){ case S_E_SUB_CASE_STR: new_search_type = S_SUB_CASE_STR; goto exact_match; case S_E_SUB_NCASE_STR: new_search_type = S_SUB_NCASE_STR; goto exact_match; case S_E_FULL_REGEX: new_search_type = S_FULL_REGEX; exact_match: case S_EXACT: search_key.dptr = s_string; search_key.dsize = strlen(s_string) + 1; ardp_accept(); key_value = dbm_fetch(fast_strings, search_key) ; if(key_value.dptr != (char *)NULL){ /* string in table */ int string_pos; bcopy(key_value.dptr,(char *)&string_pos, key_value.dsize); strings_ptr += string_pos; bcopy(strings_ptr,(char *)&str_head,sizeof(strings_header)); ardp_accept(); if(onlystrings) { cur_link = atoqlink(strings_ptr,max_hits,max_match,max_hitspm); if(cur_link) vl_insert(cur_link,vd,VLI_NOSORT); if(--match_rem <= 0) { hits_exceeded = TRUE; break; } } else if(str_head.filet_index != -1) { retval = get_match_list((int) str_head.filet_index, max_hitspm, &match_number, site_outptr, FALSE); if((retval != A_OK) && (retval != HITS_EXCEEDED)) { plog(L_DB_ERROR, NOREQ,"get_match_list failed (%d)",retval,0); goto cleanup; } if( match_number >= max_hits + offset ){ hits_exceeded = TRUE; break; } } } else if (search_type != S_EXACT) { /* Not found - but try other method */ search_type = new_search_type; goto startsearch; } break; case S_FULL_REGEX: if(re_comp(s_string) != (char *)NULL){ return (PRARCH_BAD_REGEX); } str_head.str_len = -1; ardp_accept(); while((strings_curr_off = strings_ptr + str_head.str_len + 1) < strings_end){ if((loopcount++ & 0x7ff) == 0) ardp_accept(); strings_ptr = strings_curr_off; bcopy(strings_ptr,(char *)&str_head,sizeof(strings_header)); strings_ptr += sizeof(strings_header); if(re_exec( strings_ptr ) == 1 ){ /* TRUE */ strings_curr_off = strings_ptr; ardp_accept(); if(onlystrings) { if(strstr(strings_ptr," -> ") == NULL) { /* No broken strings */ cur_link = atoqlink(strings_ptr,max_hits,max_match,max_hitspm); if(cur_link) vl_insert(cur_link,vd,VLI_NOSORT); if(--match_rem <= 0) { hits_exceeded = TRUE; break; } } } else if(str_head.filet_index != -1){ retval = get_match_list((int) str_head.filet_index, max_hitspm, &match_number, site_outptr, FALSE); if((retval != A_OK) && (retval != HITS_EXCEEDED)) { plog(L_DB_ERROR, NOREQ,"get_match_list failed (%d)",retval,0); goto cleanup; } if( match_number >= max_hits + offset ){ hits_exceeded = TRUE; break; } } } } break; #define TABLESIZE 256 case S_SUB_NCASE_STR: nocase++; case S_SUB_CASE_STR: { char pattern[MAX_STRING_LEN]; int skiptab[TABLESIZE]; register int pc, tc; register int local_loopcount = 0xfff; char *bp1; int skip; int plen; int plen_1; int tlen; unsigned char tchar; plen = strlen(s_string); plen_1 = plen -1; /* Old code (replaced by inline code taken from initskip) */ /* patlen = strlen(s_string ) ; */ /* initskip(s_string, patlen, search_type == S_SUB_NCASE_STR) ; */ if(nocase) { for(pc = 0; s_string[pc]; pc++) pattern[pc] = lowertable[s_string[pc]]; pattern[pc] = '\0'; } else strcpy(pattern,s_string); for( i = 0 ; i < TABLESIZE ; i++ ) skiptab[ i ] = plen; /* Note that we want both ucase and lcase in this table if nocase */ for( i = 0, tchar = *pattern; i < plen ; i++, tchar = *(pattern + i)) { skiptab[tchar] = plen - 1 - i; if(nocase && islower(tchar)) skiptab[_toupper(tchar)] = plen - 1 - i; } /* Begin heavily optimized and non portable code */ /* Note that we are depending on str_head being 8 bytes */ tlen = -9; /* str_head.str_len */ strings_curr_off = strings_ptr; while((strings_curr_off += tlen + 9) < strings_end) { if(--local_loopcount == 0) { ardp_accept(); local_loopcount = 0xfff; } strings_ptr = strings_curr_off; /* This is a kludge, non-portable, but it eliminates a pr call */ /* Note that the size is 8 on suns. Is there a better way? */ /* bcopy(strings_ptr,(char *)&str_head,sizeof(strings_header)); */ bp1 = (char *) &str_head; /* The copying of the file index is done only on a match */ bp1[4] = strings_ptr[4]; bp1[5] = strings_ptr[5]; /* bp1[6] = strings_ptr[6]; bp1[7] = strings_ptr[7]; */ tlen = (unsigned short) str_head.str_len; /* To catch database corruption, this is a sanity check */ if((tlen < 0) || (tlen > MAX_STRING_LEN)) { plog(L_DB_ERROR, NOREQ,"Database corrupt: string length out of bounds",0); break; } /* Old code (replaced by inline code taken from strfind) */ /* if(strfind(strings_ptr,str_head.str_len)) */ if( tlen <= plen_1 ) continue; pc = tc = plen_1; strings_ptr += 8; /* Moved the nocase test outside the inner loop for performace */ /* Clauses are identical except for the first if */ if(nocase) do { tchar = strings_ptr[tc]; /* improve efficiency of this test */ if(lowertable[tchar] == pattern[pc]) {--pc; --tc;} else { skip = skiptab[tchar] ; tc += (skip < plen_1 - pc) ? plen : skip ; pc = plen_1 ; } } while( pc >= 0 && tc < tlen ) ; else /* (!nocase) */ do { tchar = strings_ptr[tc]; /* improve efficiency of this test */ if(tchar == pattern[pc]) {--pc; --tc;} else { skip = skiptab[tchar] ; tc += (skip < plen_1 - pc) ? plen : skip ; pc = plen_1 ; } } while( pc >= 0 && tc < tlen ) ; if(pc >= 0) continue; /* We have a match */ /* Finish copying str_head - strings_curr_off */ /* is old strings_ptr. */ bp1[0] = strings_curr_off[0]; bp1[1] = strings_curr_off[1]; bp1[2] = strings_curr_off[2]; bp1[3] = strings_curr_off[3]; /* End heavily optimized and non portable code */ ardp_accept(); if(onlystrings) { if(strstr(strings_ptr," -> ") == NULL) { /* No broken strings */ cur_link = atoqlink(strings_ptr,max_hits,max_match,max_hitspm); if(cur_link) vl_insert(cur_link,vd,VLI_NOSORT); if(--match_rem <= 0) { hits_exceeded = TRUE; break; } } } else if(str_head.filet_index != -1){ retval = get_match_list((int) str_head.filet_index, max_hitspm, &match_number, site_outptr, FALSE); if((retval != A_OK) && (retval != HITS_EXCEEDED)) { plog(L_DB_ERROR,NOREQ,"get_match_list failed (%d)",retval,0); goto cleanup; } if( match_number >= max_hits + offset ) { hits_exceeded = TRUE; break; } } } } break; default: return(PRARCH_BAD_ARG); cleanup: for(i = 0;i < match_number; i++) free((char *)site_outptr[i]); free((char *)site_outptr); return(PRARCH_DB_ERROR); } for(i = 0;i < match_number; i++){ if((i & 0x7f) == 0) ardp_accept(); site_outrec = *site_outptr[i]; if(i >= offset) { cur_link = atoplink(site_outrec,flags); if(cur_link) vl_insert(cur_link,vd,VLI_NOSORT); } free((char *)site_outptr[i]); } free((char *)site_outptr); if(hits_exceeded) { /* Insert a continuation entry */ } if((search_type == S_EXACT) && (pQlen > (MATCH_CACHE_SIZE - 5))) return(PRARCH_SUCCESS); if(!onlystrings) add_to_cache(vd->links,s_string, (hits_exceeded ? max_hits : -max_hits), offset,search_type,or_search_type,flags); return(PRARCH_SUCCESS); } /* Check for cached results */ check_cache(arg,max_hits,offset,qtype,flags,linkpp) char *arg; int max_hits; int offset; search_sel qtype; int flags; VLINK *linkpp; { struct match_cache *cachep = mcache; struct match_cache *pcachep = NULL; VLINK tmp_link, cur_link; VLINK rest = NULL; VLINK next = NULL; int count = max_hits; while(cachep) { if(((qtype == cachep->search_type)||(qtype == cachep->req_type))&& (cachep->offset == offset) && /* All results are in cache - or enough to satisfy request */ ((cachep->max_hits < 0) || (max_hits <= cachep->max_hits)) && (strcmp(cachep->arg,arg) == 0) && (cachep->flags == flags)) { /* We have a match. Move to front of list */ if(pcachep) { pcachep->next = cachep->next; cachep->next = mcache; mcache = cachep; } /* We now have to clear the expanded bits or the links */ /* returned in previous queries will not be returned */ /* We also need to truncate the list of there are more */ /* matches than requested */ cur_link = cachep->matches; /* IMPORTANT: This code assumes the list is one */ /* dimensional, which is the case because we called */ /* vl_insert with the VLI_NOSORT option */ while(cur_link) { cur_link->expanded = FALSE; if((--count == 0) && cur_link->next) { /* truncate list */ if(cachep->more) { cur_link->next->previous = cachep->more->previous; cachep->more->previous = cachep->matches->previous; cachep->matches->previous->next = cachep->more; } else { cachep->more = cur_link->next; cachep->more->previous = cachep->matches->previous; } cur_link->next = NULL; cachep->matches->previous = cur_link; } else if ((cur_link->next == NULL) && (count != 0) && cachep->more) { /* Merge lists */ cachep->matches->previous = cachep->more->previous; cur_link->next = cachep->more; cachep->more->previous = cur_link; cachep->more = NULL; } cur_link = cur_link->next; } *linkpp = cachep->matches; return(TRUE); } pcachep = cachep; cachep = cachep->next; } *linkpp = NULL; return(FALSE); } /* Cache the response for later use */ add_to_cache(vl,arg,max_hits,offset,search_type,req_type,flags) VLINK vl; char *arg; int max_hits; int offset; search_sel search_type; search_sel req_type; int flags; { struct match_cache *newresults = NULL; struct match_cache *pcachep = NULL; if(cachecount < MATCH_CACHE_SIZE) { /* Create a new entry */ newresults = (struct match_cache *) malloc(sizeof(struct match_cache)); cachecount++; newresults->next = mcache; mcache = newresults; newresults->arg = stcopy(arg); newresults->max_hits = max_hits; newresults->offset = offset; newresults->search_type = search_type; newresults->req_type = req_type; newresults->flags = flags; newresults->matches = NULL; newresults->more = NULL; } else { /* Use last entry - Assumes list has at least two entries */ pcachep = mcache; while(pcachep->next) pcachep = pcachep->next; newresults = pcachep; /* move to front of list */ newresults->next = mcache; mcache = newresults; /* Fix the last entry so we don't have a cycle */ while(pcachep->next != newresults) pcachep = pcachep->next; pcachep->next = NULL; /* Free the old results */ if(newresults->matches) { newresults->matches->dontfree = FALSE; vllfree(newresults->matches); newresults->matches = NULL; } if(newresults->more) { newresults->more->dontfree = FALSE; vllfree(newresults->more); newresults->more = NULL; } newresults->arg = stcopyr(arg,newresults->arg); newresults->max_hits = max_hits; newresults->offset = offset; newresults->search_type = search_type; newresults->req_type = req_type; newresults->flags = flags; } /* Since we are caching the data. If there are any links, */ /* note that they should not be freed when sent back */ if(vl) vl->dontfree = TRUE; newresults->matches = vl; }