/* index sequential files, producing .idx, .sel and .pos files */ /* Bruce Tanner - Cerritos College */ /* Version History: 1.0 05/10/93 Original attempt 2.0 06/20/93 Create indexed files directly, add keyword count field 2.1 07/08/93 Change the file name for NOISE_WORDS 2.2 07/08/93 Move the range end (end_pos) to before the terminator 2.2jlw 07/14/93 - JLW added length spec to dash, added additional topic divider keywords 2.3 07/19/93 Set multi-buffer, multi-block counts, read-ahead, write-behind and deferred write; noticeably increased performance 2.4 07/26/93 Removed index name, added CLI$ interface, added /TOPIC 2.4jlw 07/27/93 fixed version retention, which was broken 2.5 07/27/93 Selector strings forced to lowercase; use a good copy 2.6 07/29/93 revamp /TOPIC syntax to include text, size, exclude 2.7 07/30/93 make SIZE=n pad as well as truncate field width 2.8 08/03/93 take wildcard input file names, add /OUTPUT, /VERSION 2.9 08/05/93 JLW changed filename sizes from 80 to 256 characters 2.10 08/05/93 add check for max number of topics, reformat code 2.11 08/24/93 JLW added specific statuses for exit errors 2.12 10/01/93 add /NODEFAULT_TOPIC to omit topics that have no topic keyword 2.13 11/03/93 add /LINK to generate .link file instead of .idx/.sel 2.14 11/15/93 add /NOISE=file to specify the noise words file 2.15 11/17/93 add /TOPIC=(position), /FIELD=(position, size), /PUNCTUATION 2.16 11/18/93 fix illegal strcpy for AXP, add /MAX_TOPICS 2.17 11/21/93 make load_noise friendlier, add /NOPUNCTUATION support 2.18 11/27/93 add /MINIMUM_WORD, /COUNT_WORDS 2.19 11/30/93 fix broken /TOPIC 2.20 03/20/94 sort words, add /LINK=SORT, /SEQUENTIAL, remove /COUNT_WORDS 2.21 04/29/94 add /NONUMBERS 2.22 06/23/94 add /TOPIC=(offset) /TOPIC=(position=0) 2.23 06/24/94 add /TOPIC=(end) 2.24 06/27/94 add /CANDIDATE, /KEYWORD=(text,end,exclude) 2.24a 06/29/94 replaced VAXC-specific "#include foo" declarations with more portable "#include " (so DECC won't balk). 
2.25 08/04/94 fix /TOPIC=END not matching 2.26 09/15/94 /KEYWORD=END=foo stopped at end of line if 'foo' wasn't found 2.27 09/27/94 change get_text() to return updated pointer to fix mangled text 3.0 09/29/94 redo parsing routines, add /SPECIFICATION, /TOPIC=BREAK 3.1 10/10/94 add /SELECTOR, don't index selector line 3.2 10/17/94 change /SELECTOR to /SELECTOR=(TEXT,END,BOTH) 3.3 11/04/94 add /KEYWORD=(offset), extend selector to include host/port 3.4 11/07/94 add /HELPFILE /SELECTOR=IGNORE 3.5 12/16/94 move close of link file for wildcards 3.6 01/02/95 program around selector.end default problem 3.7 03/18/95 add /TOPIC=LITERAL 3.8 04/28/95 fix problems with /LINK/WHOLE and /LINK/OUTPUT 4.0 05/16/95 add word position code 4.1 06/10/95 add /URL, convert selector file items to URLs 4.2 06/25/95 add /NOPOSITION, Joel Snyder's /COUNT 4.3 11/12/95 merge partial /URL and existing file name. 4.4 11/27/95 change format of selector 4.5 01/01/96 handle case where topic.text is used by another qualifier 4.6 01/02/96 add /EXCLUDE, /QUIET 4.7 01/04/96 change default to /NOVERSION, fix url.default and /HELP bugs 01/07/96 add /URL=PREFIX, /URL=BOTH 01/11/96 changed _tolower() to tolower() for GNU C 01/12/96 add /TOPIC=FIRST 01/14/96 add Arne Vajhøj's international toupper/tolower code 4.8 03/08/96 add Dave Smith's fixes for Gopher selectors 4.9 04/21/96 add /TOPIC=FILE=FULL, fix /TOPIC=BREAK 4.10 05/04/96 detect word index field overflow. Add /TOPIC=DEFAULT 4.11 07/08/96 remove the restriction that a line can only satisfy one /TOPIC 4.12 09/22/96 add Malcolm Dunnett's wildcard read error processing code 11/01/96 add /URL=FRAGMENT 9/6/00 RDP use dummy file pointer instead of lnk for first call of write_words, where the lnk pointer is undefined, but not used because of the switch values in the if statements, to satisfy DEC C compiler. 
*/

#define VERSION "4.12 11/01/96"

/* NOTE(review): the header names of the bare #include directives below are
   missing from this copy of the file; they are reproduced exactly as found
   and must be restored from a good copy before this will compile. */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "intctype.h"

#define CHUNK 100 /* increment to expand table of words */
#define DESC_SIZE 500 /* maximum size of a topic description */
#define SELECTOR_SIZE 200 /* maximum size of a selector (minus description) */
#define TOPIC_SIZE 20 /* maximum number of topics to list */
#define PUNCT_CHARS "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
#define DEFAULT_POS 0 /* column to look for topic by default; 0 = anywhere */
#define MAX_INPUT_LINE 500
#define MAGIC_COOKIE "MAX#INDEX"

/* One /TOPIC or /KEYWORD rule.  main() initialises every entry of topics[]
   and keywords[]; the scanning loops stop at the first entry whose 'used'
   field is zero. */
typedef struct {
    char *literal;    /* presumably the /TOPIC=LITERAL text -- not referenced in this part of the file */
    char *text;       /* text that must appear in the (lowercased) line for the rule to match */
    char *found;      /* topic text captured from the most recent matching line */
    char *end;        /* terminator text: captured text / keyword range stops where this is found */
    char *deftext;    /* description used when no description has been set yet */
    int pos;          /* 1-based column where 'text' must start; 0 = anywhere in the line */
    int size;         /* fixed width of the captured text; -1 = copy up to 'end' / end of line */
    int offset;       /* offset added to the start of the captured text */
    int exclude;      /* nonzero: skip over the matched 'text' itself */
    int force_break;  /* nonzero: a match forces a topic break */
    int used;         /* nonzero: this entry holds a rule */
    int first;        /* nonzero: keep only the first match */
    int file;         /* presumably set by /TOPIC=FILE -- not referenced in this part of the file */
} topic_str;

/* One parsed command qualifier (filled in by parse_commands). */
typedef struct {
    char *name;
    int state;
    int multiple; /* 0 = no list, 1 = unread list member, 2 = already read */
    char *value;
} switch_str;

/* One word collected for the index, with its running position number. */
typedef struct {
    char *str;
    int pos;
} table_str;

typedef struct dsc$descriptor_s dsc;

/* How the input is divided into separate text entries. */
typedef enum {para, dash, hex, equal, line, whole, field, force} sep_type;

/* words[]: words collected by build_words(); words_index entries are in use
   out of words_size allocated (grown by expand_table). */
int words_index, words_size;
table_str *words, *noise, *candidate;
int noise_index, noise_size;
int candidate_index, candidate_size;
/* max_*: field widths used when formatting index and position records */
int max_word, max_topic, max_count, max_wid, max_pos, sequential;
/* record and key buffers; allocated in main() once the widths are known */
char *idx_record, *idx_key, *prev_keyword, *pos_record;
char sel_record[DESC_SIZE + SELECTOR_SIZE + 100];
/* index into keywords[] of the rule currently being indexed; -1 = none */
int keyword_index;
FILE *spc;
switch_str switches[100];
topic_str topics[TOPIC_SIZE];
topic_str keywords[TOPIC_SIZE];
char *selector_spec, *url_spec, *url_fragment;
/* counters stored in the magic-cookie record of the index file */
int word_index = 1;
int db_index = 0;

int cli$dcl_parse();
int cli$get_value();
int cli$present();
void find_eof(struct RAB *);
void build_words(char *, char *, int, int *);
void test_words(char *, char *, int, topic_str *, int *);
void expand_table(table_str **, int *);
void write_words(FILE *, FILE *, struct RAB *, struct RAB *, struct RAB *,
                 int *, int *, char *, topic_str *, sep_type, int
*, char *, int); void load_words(char *, char *, table_str **, int *, int *); int is_noise(char *, int, int); int is_candidate(char *, int); int is_punct(char, char *); int is_spaces(char *, int, int); dsc *descr(char *); void parse_topic(char *, topic_str *); void parse_keyword(char *, topic_str *); void *my_realloc(void *, int); void index_commands(); int find_str(char *, char *); void parse_commands(dsc *, switch_str[]); int switch_present(char *); char *switch_value(char *); void lower(char *); void check_alloc(void *); int exclude(char *); void make_valid(char *); main(int argc, char *argv[]) { FILE *src, *lnk, *dummy; char *cp, *cp2, *ptr, desc[DESC_SIZE + 1], src_line[MAX_INPUT_LINE]; static char cli_input[256], punctuation[128], temp_punct[128]; static char value[20], file_arg[256], file_spec[256], out_name[256]; static char spec_name[256], spec_line[270], temp_number[20]; char orig_line[MAX_INPUT_LINE], lc_line[MAX_INPUT_LINE], temp_line[MAX_INPUT_LINE]; char spaces[DESC_SIZE + 1], help_index[10]; int start_pos, end_pos, status, index, word_pos, context = 0; sep_type type = whole; int dash_len = 0, ind, minimum_word, where, first_time = TRUE; int hex_value, field_pos = 1, field_size, zero, prefix; int max_lines = 0, read_lines = 0; /* jms/950422 */ short leng; char *dashes = NULL; struct FAB idxfab, selfab, posfab; struct RAB idxrab, selrab, posrab; struct XABKEY idxxab, selxab, posxab; $DESCRIPTOR(input_dsc, cli_input); $DESCRIPTOR(file_dsc, file_arg); $DESCRIPTOR(file_spec_dsc, file_spec); $DESCRIPTOR(out_dsc, out_name); $DESCRIPTOR(punct_dsc, temp_punct); $DESCRIPTOR(value_dsc, value); $DESCRIPTOR(spec_dsc, spec_name); $DESCRIPTOR(spec_line_dsc, spec_line); /* initialize the topics and keywords arrays */ for (index = 0; index < TOPIC_SIZE; index++) { topics[index].literal = NULL; topics[index].text = NULL; topics[index].end = NULL; topics[index].found = NULL; topics[index].deftext = NULL; topics[index].pos = 0; topics[index].size = -1; 
topics[index].offset = 0; topics[index].exclude = 0; topics[index].used = 0; topics[index].force_break = 0; topics[index].first = 0; topics[index].file = 0; keywords[index].literal = NULL; keywords[index].text = NULL; keywords[index].end = NULL; keywords[index].found = NULL; keywords[index].deftext = NULL; keywords[index].pos = 0; keywords[index].size = 0; keywords[index].offset = 0; keywords[index].exclude = 0; keywords[index].used = 0; keywords[index].force_break = 0; keywords[index].first = 0; keywords[index].file = 0; } /* * start up the CLI parse * add "index" to arg list and pass to cli$dcl_parse * this does not parse the individual qualifiers; that's done below */ status = lib$get_foreign(&input_dsc, 0, &leng, 0); for (ind = leng; ind >= 0; ind--) cli_input[ind+6] = cli_input[ind]; strncpy(cli_input, "index ", 6); input_dsc.dsc$w_length = leng+6; status = cli$dcl_parse(&input_dsc, index_commands); if (status != CLI$_NORMAL) /* error in parse, exit */ exit(7); /* no source file given; provide a little help */ if ((cli$present(descr("file")) & 1) == 0) { printf("BUILD_INDEX %s\n", VERSION); printf("Usage: index document\n"); printf(" /ADD update index files with source\n"); printf(" /CANDIDATES=file specify a file of words for index candidates\n"); printf(" /CHARACTER=n text separated by control character 'n'\n"); printf(" /COUNT=n stop after reading n lines\n"); printf(" /DASH=n text separated n dashes (default 3)\n"); printf(" /[NO]DEFAULT_TOPIC keep [discard] topics [not] matched by /TOPIC\n"); printf(" /EQUAL=n text separated n equals (default 80)\n"); printf(" /FF text separated by form feeds\n"); printf(" /FIELD=(position,size) specify topic break on field\n"); printf(" /HELPFILE=(selector,title) file to match query \"?\"\n"); printf(" /KEYWORD=(text,end,offset,exclude) specify indexing range\n"); printf(" /LINE each line is separate text entry\n"); printf(" /LINK[=SORT] generate .link file instead of .idx,.sel files\n"); printf(" /MAX_TOPICS=n maximum 
size of topic ID field (default 6)\n"); printf(" /MINIMUM_WORD=n define minimum word to index (default 3)\n"); printf(" /NOISE=file specify a file of words to omit in the index\n"); printf(" /NONUMBERS omit all numbers from the index\n"); printf(" /OUTPUT=file override name of index/selection files\n"); printf(" /PARAGRAPH text separated by blank lines\n"); printf(" /[NO]POSITION include [omit] word position information\n"); printf(" /PUNCTUATION=\"...\" specify the characters that separate words\n"); printf(" /SELECTOR=(text,end,default,both,ignore) specify selectors to generate\n"); printf(" /SEQUENTIAL create sequential files (.seqidx, .seqsel)\n"); printf(" /SPECIFICATION=file specify a file of qualifiers\n"); printf(" /TOPIC=(text,end,position,size,offset,exclude,break) specify topic names\n"); printf(" /URL=(text,end,default) specify selectors to generate\n"); printf(" /[NO]VERSION keep [discard] document version in selection\n"); printf(" /WHOLE whole file is one text entry\n"); printf(" /WORD_LENGTH=n maximum size of index key (default 20)\n"); exit(1); } /* if there is a /SPECIFICATION=file, read it */ if (cli$present(descr("specification")) & 1) { status = cli$get_value(descr("specification"), &spec_dsc, &leng); spec_name[leng] = '\0'; if ((spc = fopen(spec_name, "r")) == NULL) { printf("Can't read spec file %s\n", spec_name); exit(13); } /* and parse every line of the spec file */ while (fgets(spec_line, sizeof(spec_line), spc)) { if ((spec_line[0] == '\n') || (spec_line[0] == '#') || (spec_line[0] == '!')) continue; /* skip blank and comment lines */ ptr = strchr(spec_line, '\n'); if (ptr) *ptr = '\0'; leng = strlen(spec_line); for (ind = leng; ind >= 0; ind--) spec_line[ind+6] = spec_line[ind]; strncpy(spec_line, "index ", 6); spec_line_dsc.dsc$w_length = leng+6; parse_commands(&spec_line_dsc, switches); } } /* then parse any other qualifiers on the command line */ parse_commands(&input_dsc, switches); if (switch_present("paragraph")) type = para; if 
(switch_present("ff")) { type = hex; /* /FF same as /character=12 */ hex_value = '\f'; } if (switch_present("character")) { hex_value = atoi(switch_value("character")); type = hex; } if (switch_present("whole")) type = whole; if (switch_present("line")) type = line; if (switch_present("dash")) { dash_len = atoi(switch_value("dash")); type = dash; } if (switch_present("equal")) { dash_len = atoi(switch_value("equal")); type = equal; } if (switch_present("word_length")) { max_word = atoi(switch_value("word_length")); } if (switch_present("count")) { max_lines = atoi(switch_value("count")); } if (switch_present("field")) { type = field; field_pos = atoi(switch_value("field.position")); field_size = atoi(switch_value("field.size")); } strcpy(punctuation, PUNCT_CHARS); /* default for /punctuation */ if (switch_present("punctuation")) { strcpy(temp_punct, switch_value("punctuation")); if (temp_punct[0] == '"') { /* if quoted string */ strncpy(punctuation, temp_punct+1, leng-2); punctuation[leng-2] = '\0'; } else if (strlen(temp_punct) > 0) strcpy(punctuation, temp_punct); } else { /* /nopunctuation="$" means exclude $ from punct chars */ if (cp = switch_value("punctuation")) strcpy(temp_punct, cp); else strcpy(temp_punct, ""); if (temp_punct[0] == '"') { /* if quoted string */ strcpy(temp_punct, temp_punct+1); temp_punct[leng-2] = '\0'; } for (cp = temp_punct; *cp; cp++) { cp2 = strchr(punctuation, *cp); if (cp2) strcpy(cp2, cp2+1); /* remove character from punctuation */ } } if (switch_present("max_topics")) { /* query assumes topic = wid = count */ max_topic = max_count = max_wid = max_pos = atoi(switch_value("max_topics")); if (max_topic > 9) { printf("/MAX_TOPICS specifies the number of digits in the topic number field.\n"); printf("A 32 bit system cannot handle integers greater than 9 digits.\n"); exit(9); } } if (switch_present("minimum_word")) minimum_word = atoi(switch_value("minimum_word")); sequential = (switch_present("sequential")); strcpy(file_arg, 
switch_value("file")); /* get source */ file_dsc.dsc$w_length = (short) strlen(file_arg); /* set the descriptor length */ strncpy(file_spec, "", sizeof(file_spec)); /* clear out file_spec */ /* in the case of wildcard file names, lib$find_file will expand them */ status = lib$find_file(&file_dsc, &file_spec_dsc, &context, 0, 0, 0, 0); if ((status & 1) == 0) { printf("lib$find_file failed: status %X\n", status); exit(11); } ptr = strchr(file_spec, ' '); if (ptr) *ptr = '\0'; /* chop off trailing spaces */ strcpy(out_name, file_spec); /* make copy for output spec */ if (switch_present("output")) /* if /output, overwrite out_name */ strcpy(out_name, switch_value("output")); words_size = words_index = 0; /* no words yet */ words = NULL; noise_size = noise_index = 0; /* no noise yet */ noise = NULL; candidate_size = candidate_index = 0; /* no candidate yet */ candidate = NULL; if (!switch_present("link")) { /* load noise words */ load_words("noise", punctuation, &noise, &noise_size, &noise_index); /* load candidate words */ load_words("candidates", punctuation, &candidate, &candidate_size, &candidate_index); } /* dynamically allocate space and constants that depend on switches */ dashes = (char *) malloc(dash_len+1); memset((void *) dashes, (type==dash) ? 
'-' : '=', dash_len); dashes[dash_len] = '\0'; memset((void *) spaces, ' ', DESC_SIZE); /* make spaces for padding topic */ spaces[DESC_SIZE] = '\0'; idx_record = (char *) calloc(max_word + max_count + max_topic + max_wid + 1, sizeof(char)); idx_key = (char *) calloc(max_word + 1, sizeof(char)); pos_record = (char *) calloc(max_wid + max_pos + 1, sizeof(char)); prev_keyword = (char *) calloc(max_word + 1, sizeof(char)); if (sequential && ((max_topic != 6) || (max_word != 20))) { printf("WARNING: Your FDL file will need to reflect the following:\n"); printf("IDX file record length: %d\n", max_word + max_topic * 3); printf("IDX file key 0 size: %d\n", max_word + max_topic); printf("SEL file key 0 size: %d\n", max_topic); printf("POS file record length: %d\n", max_wid + max_pos); printf("POS file key 0 size: %d\n", max_wid); } /* set up index file */ if (!switch_present("link")) { ptr = strrchr(out_name, '.'); /* just get file name */ if (ptr) *ptr = '\0'; if (sequential) strcat(out_name, ".seqidx"); else strcat(out_name, ".idx"); } idxfab = cc$rms_fab; idxfab.fab$l_alq = 100; idxfab.fab$b_bks = 3; idxfab.fab$w_deq = 25; idxfab.fab$b_fac = FAB$M_PUT | FAB$M_GET | FAB$M_DEL; idxfab.fab$l_fna = out_name; idxfab.fab$b_fns = strlen(out_name); idxfab.fab$l_fop = FAB$M_CBT | FAB$M_DFW; idxfab.fab$w_mrs = max_word + max_topic + max_wid + max_count; idxfab.fab$b_org = sequential ? FAB$C_SEQ : FAB$C_IDX; idxfab.fab$b_rat = FAB$M_CR; idxfab.fab$b_rfm = FAB$C_FIX; idxfab.fab$b_shr = FAB$M_NIL; idxfab.fab$l_xab = (char *) &idxxab; idxrab = cc$rms_rab; idxrab.rab$l_fab = (struct FAB *) &idxfab; idxrab.rab$b_krf = 0; idxrab.rab$l_kbf = idx_key; idxrab.rab$b_ksz = max_word; idxrab.rab$b_rac = sequential ? 
RAB$C_SEQ : RAB$C_KEY; idxrab.rab$l_rbf = idx_record; idxrab.rab$w_rsz = max_word + max_topic + max_wid + max_count; idxrab.rab$l_ubf = idx_record; idxrab.rab$w_usz = max_word + max_topic + max_wid + max_count; idxrab.rab$b_mbf = 20; idxrab.rab$l_rop = RAB$M_RAH | RAB$M_WBH; idxxab = cc$rms_xabkey; idxxab.xab$b_dtp = XAB$C_STG; idxxab.xab$b_flg = XAB$M_DAT_NCMPR | XAB$M_IDX_NCMPR; idxxab.xab$w_pos0 = 0; idxxab.xab$b_siz0 = max_word + max_topic; idxxab.xab$b_ref = 0; if (!switch_present("link")) { if (switch_present("add")) { if (((status = sys$open(&idxfab)) & 1) != SS$_NORMAL) lib$stop(status); } else { if (((status = sys$create(&idxfab)) & 1) != SS$_NORMAL) lib$stop(status); } if (((status = sys$connect(&idxrab)) & 1) != SS$_NORMAL) lib$stop(status); /* set up selector file */ ptr = strrchr(out_name, '.'); /* just get file name */ if (ptr) *ptr = '\0'; if (sequential) strcat(out_name, ".seqsel"); else strcat(out_name, ".sel"); } selfab = cc$rms_fab; selfab.fab$l_alq = 10; selfab.fab$b_bks = 3; selfab.fab$w_deq = 5; selfab.fab$b_fac = FAB$M_PUT; selfab.fab$l_fna = out_name; selfab.fab$b_fns = strlen(out_name); selfab.fab$l_fop = FAB$M_CBT | FAB$M_DFW; selfab.fab$w_mrs = max_topic + DESC_SIZE + SELECTOR_SIZE; selfab.fab$b_org = sequential ? FAB$C_SEQ : FAB$C_IDX; selfab.fab$b_rat = FAB$M_CR; selfab.fab$b_rfm = FAB$C_VAR; selfab.fab$b_shr = FAB$M_NIL; selfab.fab$l_xab = (char *) &selxab; selrab = cc$rms_rab; selrab.rab$l_fab = (struct FAB *) &selfab; selrab.rab$b_rac = sequential ? 
RAB$C_SEQ : RAB$C_KEY; selrab.rab$l_rbf = sel_record; selrab.rab$b_mbf = 20; selrab.rab$l_rop = RAB$M_RAH | RAB$M_WBH | RAB$M_EOF; selxab = cc$rms_xabkey; selxab.xab$b_dtp = XAB$C_STG; selxab.xab$b_flg = XAB$M_DAT_NCMPR | XAB$M_IDX_NCMPR; selxab.xab$w_pos0 = 0; selxab.xab$b_siz0 = max_topic; selxab.xab$b_ref = 0; if (!switch_present("link")) { if (switch_present("add")) { if (((status = sys$open(&selfab)) & 1) != SS$_NORMAL) lib$stop(status); } else { if (((status = sys$create(&selfab)) & 1) != SS$_NORMAL) lib$stop(status); } if (((status = sys$connect(&selrab)) & 1) != SS$_NORMAL) lib$stop(status); /* set up position file */ ptr = strrchr(out_name, '.'); /* just get file name */ if (ptr) *ptr = '\0'; if (sequential) strcat(out_name, ".seqpos"); else strcat(out_name, ".pos"); } posfab = cc$rms_fab; posfab.fab$l_alq = 100; posfab.fab$b_bks = 3; posfab.fab$w_deq = 25; posfab.fab$b_fac = FAB$M_PUT; posfab.fab$l_fna = out_name; posfab.fab$b_fns = strlen(out_name); posfab.fab$l_fop = FAB$M_CBT | FAB$M_DFW; posfab.fab$w_mrs = max_wid + max_pos; posfab.fab$b_org = sequential ? FAB$C_SEQ : FAB$C_IDX; posfab.fab$b_rat = FAB$M_CR; posfab.fab$b_rfm = FAB$C_FIX; posfab.fab$b_shr = FAB$M_NIL; posfab.fab$l_xab = (char *) &posxab; posrab = cc$rms_rab; posrab.rab$l_fab = (struct FAB *) &posfab; posrab.rab$b_rac = sequential ? 
RAB$C_SEQ : RAB$C_KEY; posrab.rab$l_rbf = pos_record; posrab.rab$w_rsz = max_wid + max_pos; posrab.rab$l_ubf = pos_record; posrab.rab$w_usz = max_wid + max_pos; posrab.rab$b_mbf = 20; posrab.rab$l_rop = RAB$M_RAH | RAB$M_WBH | RAB$M_EOF; posxab = cc$rms_xabkey; posxab.xab$b_dtp = XAB$C_STG; posxab.xab$b_flg = XAB$M_DUP | XAB$M_DAT_NCMPR | XAB$M_IDX_NCMPR; posxab.xab$w_pos0 = 0; posxab.xab$b_siz0 = max_wid; posxab.xab$b_ref = 0; selector_spec = (char *) calloc(1, sizeof(char)); /* make empty spec */ url_spec = (char *) calloc(1, sizeof(char)); url_fragment = (char *) calloc(1, sizeof(char)); if (!switch_present("link")) { if (switch_present("position")) { if (switch_present("add")) { if (((status = sys$open(&posfab)) & 1) != SS$_NORMAL) lib$stop(status); } else { if (((status = sys$create(&posfab)) & 1) != SS$_NORMAL) lib$stop(status); } if (((status = sys$connect(&posrab)) & 1) != SS$_NORMAL) lib$stop(status); } /* if /helpfile given, set up index values */ if (switch_present("helpfile.selector")) { selector_spec = (char *) calloc(strlen(switch_value("helpfile.selector")) + 1, sizeof(char)); strcpy(selector_spec, switch_value("helpfile.selector")); if (switch_present("helpfile.title")) strcpy(desc, switch_value("helpfile.title")); else strcpy(desc, "Help on search commands"); word_pos = 0; strcpy(help_index, "?"); build_words(help_index, "", 0, &word_pos); /* add "?" 
to the index */ strcpy(help_index, "?help"); build_words(help_index, "", 0, &word_pos); /* add "?help" to the index */ dummy = NULL; write_words((FILE *) 0, dummy, &selrab, &idxrab, &posrab, &start_pos, &end_pos, desc, topics, type, &word_pos, punctuation, minimum_word); /* write helpfile */ } } else { /* /link */ if (*out_name != '.') { /* if the output name has no leading dot */ ptr = strrchr(out_name, '.'); if (ptr) *ptr = '\0'; /* replace any file type with .link */ strcat(out_name, ".link"); } lnk = fopen(out_name, "w", "mbc=50", "mbf=20"); if (!switch_present("link.sort")) fprintf(lnk, "Sortdir=False\n\n"); } if (switch_present("add")) { if (idxxab.xab$b_siz0 != (max_word + max_count)) { printf("Source and index file /MAX_TOPIC & /WORD_SIZE do not match\n"); exit(13); } /* find previous magic cookie in the index file */ if (switch_present("sequential")) { find_eof(&idxrab); status = sys$get(&idxrab); if ((status & 1) != SS$_NORMAL) lib$stop(status); } else { char *record_copy; strcpy(idx_key, MAGIC_COOKIE); idxrab.rab$l_kbf = idx_key; idxrab.rab$b_ksz = strlen(idx_key); record_copy = (char *) calloc(max_word + max_count + max_topic + max_wid + 1, sizeof(char)); check_alloc(record_copy); for (;;) { status = sys$get(&idxrab); if (((status & 1) == SS$_NORMAL) && (strncmp(idx_record, MAGIC_COOKIE, strlen(MAGIC_COOKIE)) == 0)) strcpy(record_copy, idx_record); else break; idxrab.rab$b_rac = RAB$C_SEQ; } strcpy(idx_record, record_copy); free(record_copy); /* should be cfree, but alpha chokes */ idxrab.rab$b_rac = RAB$C_KEY; } if (strncmp(idx_record, MAGIC_COOKIE, strlen(MAGIC_COOKIE))) { printf("Not Magic: %s\n", idx_record); exit(17); } strncpy(temp_number, "", sizeof(temp_number)); strncpy(temp_number, idx_record + max_word, max_count); db_index = atoi(temp_number); strncpy(temp_number, "", sizeof(temp_number)); strncpy(temp_number, idx_record + max_word + max_count, max_topic); word_index = atoi(temp_number); } /* if /NOPOSITION, force word index to 0 */ if 
(!switch_present("position")) word_index = 0; for (;;) { /* process all files in input spec, first one already found */ if (first_time) { /* skip the lib$find_file the first time */ first_time = FALSE; status = 1; } else status = lib$find_file(&file_dsc, &file_spec_dsc, &context, 0, 0, 0, 0); if (status == RMS$_NMF) { /* no more files */ lib$find_file_end(&context); if ((ptr = switch_value("file")) == NULL) break; /* no file names left */ strcpy(file_arg, ptr); /* get source */ file_dsc.dsc$w_length = (short) strlen(file_arg); /* set the descriptor length */ strncpy(file_spec, "", sizeof(file_spec)); /* clear out file_spec */ status = lib$find_file(&file_dsc, &file_spec_dsc, &context, 0, 0, 0, 0); } ptr = strchr(file_spec, ' '); if (ptr) *ptr = '\0'; /* chop off trailing spaces */ if ((status & 1) == 0) { printf("lib$find_file error %X on %s\n", status, file_spec); continue; } if (exclude(file_spec)) /* should we exclude this file? */ continue; /* yes */ if ((src = fopen(file_spec, "r", "mbc=50", "mbf=20")) == NULL) { printf("Can't read input file %s\n", file_spec); continue; } if (!switch_present("quiet")) printf("Building index for %s\n", file_spec); start_pos = ftell(src); /* init start position */ strncpy(desc, "", sizeof(desc)); word_pos = 0; while (fgets(src_line, sizeof(src_line), src)) { /* If we've read too many lines, then break out jms/950422 */ read_lines++; if ( (max_lines) && (read_lines > max_lines) ) { if (!switch_present("quiet")) printf(" Finishing early because maximum line count reached\n"); /* skip all of the lines until the last line, and then * replace the line we just read with that one. Continue * on. The error will get re-echoed when we go back to the * top of the loop, so we don't have to maintain any * icky state information. 
jms/950626 */ /* strncpy(src_line, skip_to_end_of_file(src), sizeof(src_line)); */ break; } /* if the first character of the line is the hex value, end topic */ if ((src_line[0] == hex_value) && (type == hex)) { write_words(src, lnk, &selrab, &idxrab, &posrab, &start_pos, &end_pos, desc, topics, type, &word_pos, punctuation, minimum_word); continue; } ptr = strchr(src_line, '\n'); if (ptr) *ptr = '\0'; /* remove newline */ for (ptr = src_line; *ptr; ptr++) if (iscntrl(*ptr)) *ptr = ' '; /* convert tabs to spaces */ while ((strlen(src_line) > 0) && (src_line[strlen(src_line)-1] == ' ')) src_line[strlen(src_line)-1] = '\0';/* remove trailing blanks */ strcpy(orig_line, src_line); /* copy before forcing lower case */ lower(src_line); /* force lowercase */ strcpy(lc_line, src_line); /* copy with leading blanks */ for (ptr = src_line; *ptr; ptr++) if (*ptr > ' ') break; /* find first non-blank char */ strcpy(src_line, ptr); /* remove leading blanks */ /* break on dashes */ if (((type == equal) || (type == dash)) && (strncmp(orig_line, dashes, dash_len) == 0)) { write_words(src, lnk, &selrab, &idxrab, &posrab, &start_pos, &end_pos, desc, topics, type, &word_pos, punctuation, minimum_word); continue; } /* break on paragraph */ if ((type == para) && (strlen(src_line) == 0)) { write_words(src, lnk, &selrab, &idxrab, &posrab, &start_pos, &end_pos, desc, topics, type, &word_pos, punctuation, minimum_word); continue; } /* break on non-empty field */ if ((type == field) && !is_spaces(orig_line, field_pos, field_size)) { write_words(src, lnk, &selrab, &idxrab, &posrab, &start_pos, &end_pos, desc, topics, type, &word_pos, punctuation, minimum_word); start_pos = end_pos; /* don't skip over line with field break */ } /* apply topic rules */ for (index = 0; topics[index].used; index++) { where = topics[index].pos; /* where text is found */ if (topics[index].deftext && (strlen(desc) == 0)) strncpy(desc, topics[index].deftext, DESC_SIZE); if (/* if we have no match or don't want the 
first match */ ((topics[index].found == NULL) || (strlen(topics[index].found) == 0) || (topics[index].first == FALSE)) && /* if text matches the source text and position */ (((topics[index].pos > 0) && topics[index].text && strncmp(lc_line + topics[index].pos - 1, topics[index].text, strlen(topics[index].text)) == 0) || /* or position = 0 and text is found _somewhere_ */ ((topics[index].pos == 0) && (where = find_str(lc_line, topics[index].text))) || /* or no text given but position and size field is non-blank */ (!topics[index].text && !is_spaces(orig_line, topics[index].pos, topics[index].size))) ) { /* if topic matches and requested a break, do it */ if (topics[index].force_break) { type = force; /* override other types */ write_words(src, lnk, &selrab, &idxrab, &posrab, &start_pos, &end_pos, desc, topics, type, &word_pos, punctuation, minimum_word); start_pos = end_pos; /* don't skip over topic line */ } /* make copy of line at start of topic text */ if (topics[index].exclude) strcpy(temp_line, orig_line + where - 1 + strlen(topics[index].text)); else strcpy(temp_line, orig_line + where - 1); topics[index].found = (char *) my_realloc((char *) topics[index].found, ((topics[index].size > -1) ? 
topics[index].size : strlen(orig_line)) + 1); if (topics[index].size > -1) { /* want fixed topic size */ strncpy(topics[index].found, temp_line + topics[index].offset, topics[index].size); topics[index].found[topics[index].size] = '\0'; strncat(topics[index].found, spaces, topics[index].size - strlen(topics[index].found)); } else { /* copy to end of topic */ strcpy(topics[index].found, temp_line + topics[index].offset); if (where = find_str(topics[index].found, topics[index].end)) topics[index].found[where - 1] = '\0'; /* terminate the found string */ } } } /* save the first line by default */ if ((switch_present("default_topic")) && (strlen(desc) == 0)) strncpy(desc, orig_line, DESC_SIZE); /* apply selector rules */ if (switch_present("selector.text") && (where = find_str(lc_line, switch_value("selector.text")))) { selector_spec = (char *) my_realloc((char *) selector_spec, strlen(orig_line) + 1); strcpy(selector_spec, orig_line + where - 1 + strlen(switch_value("selector.text"))); if (switch_present("selector.end") && /* if selector=end given */ (where = find_str(selector_spec, switch_value("selector.end")))) selector_spec[where - 1] = '\0'; /* mark selector end */ while (*selector_spec == ' ') /* remove leading spaces */ strcpy(selector_spec, selector_spec + 1); continue; /* do not index this line */ } /* URL rules */ if (switch_present("url.text") && (where = find_str(lc_line, switch_value("url.text")))) { if (switch_present("url.prefix")) prefix = strlen(switch_value("url.prefix")); else prefix = 0; url_spec = (char *) my_realloc((char *) url_spec, prefix + strlen(orig_line) + 1); /* prefix url */ strcpy(url_spec, switch_present("url.prefix") ? 
switch_value("url.prefix") : ""); /* plus url found */ strcat(url_spec, orig_line + where - 1 + strlen(switch_value("url.text"))); if (switch_present("url.end") && /* if url=end given */ (where = find_str(url_spec, switch_value("url.end")))) url_spec[where - 1] = '\0'; /* mark url end */ while (*url_spec == ' ') /* remove leading spaces */ strcpy(url_spec, url_spec + 1); continue; /* do not index this line */ } if (switch_present("url.fragment") && (where = find_str(lc_line, switch_value("url.fragment")))) { url_fragment = (char *) my_realloc((char *) url_fragment, strlen(orig_line) + 1); /* save fragment name for later */ strcpy(url_fragment, orig_line + where - 1 + strlen(switch_value("url.fragment"))); while(strlen(url_fragment)) if (!isalpha(*url_fragment)) strcpy(url_fragment, url_fragment+1); else break; for (ptr = url_fragment; *ptr; ptr++) if (!isalnum(*ptr) && (*ptr != '-')) { *ptr = '\0'; /* fragment name must be alphanum or '-' */ break; } } /* apply keyword rules, index words */ if (!switch_present("link")) test_words(src_line, punctuation, minimum_word, keywords, &word_pos); end_pos = ftell(src); /* end_pos points before any terminator */ /* force topic break if in line mode */ if (type == line) write_words(src, lnk, &selrab, &idxrab, &posrab, &start_pos, &end_pos, desc, topics, type, &word_pos, punctuation, minimum_word); } /* in case file doesn't end with a terminator */ write_words(src, lnk, &selrab, &idxrab, &posrab, &start_pos, &end_pos, desc, topics, type, &word_pos, punctuation, minimum_word); fclose(src); } if (switch_present("link")) fclose(lnk); else { /* write out the values of db_index and word_index with a magic cookie */ strcpy(idx_key, MAGIC_COOKIE); idxrab.rab$l_kbf = idx_key; idxrab.rab$b_ksz = strlen(idx_key); status = sys$get(&idxrab); /* find old magic cookie */ if ((status & 1) == SS$_NORMAL) { status = sys$delete(&idxrab); /* and delete it */ if ((status & 1) != SS$_NORMAL) lib$stop(status); } sprintf(idx_record, 
"%-*s%0*d%0*d%0*d", max_word, MAGIC_COOKIE, max_topic, db_index, max_wid, word_index, max_count, 0); idxrab.rab$w_rsz = strlen(idx_record); status = sys$put(&idxrab); /* write new magic cookie */ if ((status & 1) != SS$_NORMAL) lib$stop(status); sys$close(&selfab); sys$close(&idxfab); sys$close(&posfab); } /* * don't bother freeing memory, just exit * * for (ind = 0; ind < words_size; ind++) * if (words[ind].str) cfree(words[ind].str); * free(words); */ } /* perform binary search on sequential index file */ /* since the index file is fixed length we can perform relative key lookups */ void find_eof(struct RAB *idxptr) { int status, key, interval = 500, bottom = 0, top = 0; idxptr->rab$l_kbf = (char *) &key; idxptr->rab$b_ksz = 4; idxptr->rab$b_rac = RAB$C_KEY; for (;;) { key = bottom + interval; status = sys$find(idxptr); if ((status & 1) != SS$_NORMAL) break; bottom += interval; } top = bottom + interval; for (;;) { interval = (top - bottom)/2; if (interval == 0) break; key = bottom + interval; status = sys$find(idxptr); if ((status & 1) == SS$_NORMAL) bottom += interval; else top -= interval; } /* if we're past EOF, backup to the last record */ while ((status & 1) != SS$_NORMAL) { key--; status = sys$find(idxptr); } idxptr->rab$b_rac = RAB$C_SEQ; } /* test for start/end of keyword indexing */ void test_words(char *line, char *punct, int minimum_word, topic_str *keywords, int *word_pos) { char test_line[MAX_INPUT_LINE], copy_line[MAX_INPUT_LINE]; int ind, where, retry; if (!switch_present("keyword")) { /* no /keyword= */ build_words(line, punct, minimum_word, word_pos); /* index everything */ return; } if ((keyword_index > -1) && ((keywords[keyword_index].end == NULL) || (strlen(keywords[keyword_index].end) == 0))) keyword_index = -1; /* keyword indexing stops at EOL unless /keyword=end */ strcpy(test_line, line); /* copy source line */ do { where = 0; retry = FALSE; if (keyword_index == -1) { /* between keywords */ for (ind = 0; keywords[ind].used; ind++) if 
(where = find_str(test_line, keywords[ind].text)) break; if (where) { keyword_index = ind; /* record current keyword */ strcpy(test_line, test_line + where - 1 + keywords[ind].offset); /* remove up to keyword */ if (keywords[ind].exclude) strcpy(test_line, test_line + strlen(keywords[ind].text)); } } if (keyword_index > -1) /* in keyword index */ if (where = find_str(test_line, keywords[keyword_index].end)) { strcpy(copy_line, test_line); test_line[where - 1] = '\0'; /* index contents of line */ build_words(test_line, punct, minimum_word, word_pos); strcpy(test_line, copy_line + where - 1); /* restart at end word */ keyword_index = -1; /* no longer indexing */ retry = TRUE; /* check for another keyword */ } else { /* indexing and no end word found */ /* index contents of line */ build_words(test_line, punct, minimum_word, word_pos); return; } } while (retry); } /* break line into words and save them in words[] */ void build_words(char *line, char *punct, int minimum_word, int *pos) { char *cp, *cp2; for (cp = line; *cp; cp++) /* convert punctuation to spaces */ if (is_punct(*cp, punct)) *cp = ' '; strcat(line, " "); /* line ends with a space */ cp = line; while(cp2 = strchr(cp, ' ')) { /* break at space boundary */ *cp2 = '\0'; if (strlen(cp) > max_word) if (!switch_present("quiet")) printf("Truncating %d character word (%s) to %d characters\n", strlen(cp), cp, max_word); if ((strlen(cp) > 0) && (((candidate_size == 0) && !is_noise(cp, noise_size, minimum_word)) || (candidate_size && is_candidate(cp, candidate_size)))) { if (words_index == words_size) /* table full */ expand_table(&words, &words_size); strncpy(words[words_index].str, cp, max_word); words[words_index++].pos = ++(*pos); } cp = cp2 + 1; } } /* expand *table[] by CHUNK elements of max_word characters */ void expand_table(table_str **table, int *size) { int ind; *table = (table_str *) my_realloc((table_str *) *table, (*size + CHUNK) * sizeof(table_str)); for (ind = 0; ind < CHUNK; ind++) { 
(*table)[*size + ind].str = (char *) calloc(max_word + 1, sizeof(char)); check_alloc((*table)[*size + ind].str); (*table)[*size + ind].pos = -1; } *size += CHUNK; } /* return base to exp power */ int power(int base, int exp) { int result; result = base; while (--exp) result *= base; return result; } /* qsort compare routine */ int compare (const void *str1, const void *str2) { return (strcmp((*(table_str *) str1).str, (*(table_str *) str2).str)); } /* write out the index entries */ write_index(struct RAB *idxptr, struct RAB *posptr) { int ind, inc, status, dup_count; /* write out the words */ /* sort keys for counts and $put performance */ qsort(words, words_index, sizeof(table_str), compare); dup_count = 1; strcpy(prev_keyword, words[0].str); for (ind = 1; ind < words_index; ind++) { if (strcmp(words[ind].str, prev_keyword) == 0) { dup_count++; inc = 0; } else { sprintf(idx_record, "%-*s%0*d%0*d%0*d", max_word, prev_keyword, max_topic, db_index, max_wid, word_index, max_count, dup_count); status = sys$put(idxptr); if ((status & 1) != SS$_NORMAL) lib$stop(status); strcpy(prev_keyword, words[ind].str); dup_count = 1; inc = 1; } if (switch_present("position")) { sprintf(pos_record, "%0*d%0*d", max_wid, word_index, max_pos, words[ind-1].pos); status = sys$put(posptr); if ((status & 1) != SS$_NORMAL) lib$stop(status); word_index += inc; } } /* write out the last word */ sprintf(idx_record, "%-*s%0*d%0*d%0*d", max_word, prev_keyword, max_topic, db_index, max_wid, word_index, max_count, dup_count); status = sys$put(idxptr); if ((status & 1) != SS$_NORMAL) lib$stop(status); if (switch_present("position")) { sprintf(pos_record, "%0*d%0*d", max_wid, word_index, max_pos, words[ind-1].pos); status = sys$put(posptr); if ((status & 1) != SS$_NORMAL) lib$stop(status); word_index++; } } /* reset topic context */ void reset_topic(FILE *src, char *desc, int *start_pos, topic_str *topics, int *word_pos) { int ind; strncpy(desc, "", sizeof(desc)); *start_pos = ftell(src); /* init 
start position */ for (ind = 0; ind < words_index; ind++) /* clear out words[] */ *words[ind].str = '\0'; words_index = 0; *word_pos = 0; for (ind = 0; ind < TOPIC_SIZE; ind++) /* clear out topics[] */ if (topics[ind].found) *topics[ind].found = '\0'; } int test_size(char *desc, char *spec, int size) { if (strlen(desc)+strlen(spec)+size+max_topic > sizeof(sel_record)-1) { printf("Selector is too large: %s\n", spec); return 0; } return 1; } /* take the URL from url_spec and add it to the selector */ /* merge filename with default url if no url given */ /* note: doesn't handle duplication of host, port between URL and selector */ void add_url(char *rec, char *filename) { char method[100], host[200], path[300], new[500]; char *cp, *cp2 = NULL, *ptr; /* parse the URL */ strncpy(method, "", sizeof(method)); strncpy(host, "", sizeof(host)); strncpy(path, "", sizeof(path)); if ((cp=strstr(url_spec, "://")) != NULL) strncpy(method, url_spec, cp-url_spec); if ((cp=strstr(url_spec, "//")) != NULL) { cp2 = strchr(cp+2, '/'); if (cp2) { strncpy(host, cp+2, cp2-(cp+2)); host[cp2-(cp+2)] = '\0'; } else strcpy(host, cp+2); } if (cp2 == NULL) cp2 = url_spec; strcpy(path, cp2); /* if a partial selector = URL default, merge in real file name */ ptr = switch_value("url.default"); if (ptr && (strcmp(url_spec, ptr) == 0) && (url_spec[strlen(url_spec)-1] == '/')) strcat(path, strchr(filename, ']')+1); if (strlen(url_fragment)) { strcat(path, "#"); strcat(path, url_fragment); } if (strlen(path)) { sprintf(new, "|%s|%s|%s", host, method, path); if (test_size(rec, new, 10)) strcat(rec, new); } } /* write out file selector then write out words */ void write_words(FILE *src, FILE *lnk, struct RAB *selptr, struct RAB *idxptr, struct RAB *posptr, int *start_pos, int *end_pos, char *desc, topic_str *topics, sep_type type, int *word_pos, char *punct, int minimum_word) { int ind, status, new_desc; char filename[256], *ptr, temp_desc[512] = "", end_char; char hostname[256], portname[10], 
ptype[25], path[SELECTOR_SIZE]; keyword_index = -1; /* stop indexing at end of section */ if (!switch_present("link")) { /* may only want to do topic.literal + selector */ /* if the topic is really empty, the test for an empty description will fail */ #if 0 if (words_index == 0) { reset_topic(src, desc, start_pos, topics, word_pos); return; /* no words to write */ } #endif if ((word_index + 2) >= power(10, max_topic)) { printf("You have reached %d words in this index\n", word_index); printf("Please re-index with /MAX_TOPIC larger than %d ", max_topic); printf("or use /NOPOSITION\n"); exit(5); } if ((*word_pos + 2) >= power(10, max_topic)) { printf("You have reached %d words in this document\n", *word_pos); printf("Please re-index with /MAX_TOPIC larger than %d ", max_topic); printf("or use /NOPOSITION\n"); exit(5); } } if (src != NULL) fgetname(src, filename); /* if /noversion, get rid of the version number */ if (!switch_present("version")) { ptr = strchr(filename, ';'); if (ptr) *ptr = '\0'; } lower(filename); /* force filename lowercase */ new_desc = 0; /* build topic description */ for (ind = 0; ind < TOPIC_SIZE; ind++) { if (topics[ind].literal && (strlen(topics[ind].literal) > 0)) { strcat(temp_desc, topics[ind].literal); new_desc = 1; } if (topics[ind].found && (strlen(topics[ind].found) > 0)) { /* literals can handle their own spacing */ if ((strlen(temp_desc) > 0) && (topics[ind].literal == NULL)) strcat(temp_desc, " "); strcat(temp_desc, topics[ind].found); } if (topics[ind].file == 1) { /* just name.type */ strcat(temp_desc, strchr(filename, ']')+1); } if (topics[ind].file == 2) { /* full file spec */ strcat(temp_desc, filename); } } if (strlen(temp_desc) > 0) strncpy(desc, temp_desc, DESC_SIZE); if (new_desc) /* we have added new words via topic.literal */ build_words(temp_desc, punct, minimum_word, word_pos); /* index them */ if (words_index == 0) { reset_topic(src, desc, start_pos, topics, word_pos); return; /* no words to write */ } /* if no selector 
found, copy in default selector */ if ((strlen(selector_spec) == 0) && switch_present("selector.default")) { selector_spec = (char *) my_realloc((char *) selector_spec, strlen(switch_value("selector.default")) + 1); strcpy(selector_spec, switch_value("selector.default")); } /* ditto for URLs */ if ((strlen(url_spec) == 0) && switch_present("url.default")) { url_spec = (char *) my_realloc((char *) url_spec, strlen(switch_value("url.default")) + 1); strcpy(url_spec, switch_value("url.default")); } if ((strlen(desc) == 0) || /* if no description or */ ((switch_present("selector.text") && !switch_present("selector.both") && /* selectors only */ (strlen(selector_spec) == 0))) || /* and no selector found or */ ((switch_present("url.text") && !switch_present("url.both") && /* urls only */ (strlen(url_spec) == 0)))) { /* and no url found */ reset_topic(src, desc, start_pos, topics, word_pos); /* reset topic stuff */ return; /* and quit */ } strncpy(hostname, "", sizeof(hostname)); /* init hostname */ strncpy(portname, "", sizeof(portname)); /* init portname */ /* we have a selector that applies to this article */ if (strlen(selector_spec)) { if (switch_present("selector.ignore") && /* should we ignore this? */ (strchr(switch_value("selector.ignore"), *selector_spec))) { reset_topic(src, desc, start_pos, topics, word_pos); return; /* yes, ignore this */ } /* parse out the host and port from the selector */ if (ptr = strchr(selector_spec, '|')) { *ptr = '\0'; /* mark off selector from host */ sprintf(hostname, "%s", ptr + 1); /* copy host/port */ if (ptr = strchr(hostname, '|')) { *ptr = '\0'; /* mark off port from host */ sprintf(portname, "%s", ptr + 1); } } /* /selector and /link */ if (switch_present("link")) { fprintf(lnk, "Name=%s\nType=%c\n", desc, *selector_spec); fprintf(lnk, "Path=%s\n", selector_spec+1); fprintf(lnk, "Host=%s\nPort=%s\n\n", strlen(hostname) ? hostname + 1 : "+", strlen(portname) ? 
portname + 1 : "+"); } else if (test_size(desc, selector_spec, strlen(hostname) + strlen(portname) + 1)) { /* if a partial selector, merge in real file name */ /* except if it is a gopher directory (gtype "1") */ end_char = selector_spec[strlen(selector_spec)-1]; if (end_char == ']' && *selector_spec != '1' ) strcat(selector_spec, strchr(filename, ']')+1); ptype[0] = *(selector_spec+1); /* break selector into ptype and path */ if (*(selector_spec+1) != 'R') if (*(selector_spec+1) == '\0') ptr = selector_spec+1; /* null ptype = null path */ else ptr = selector_spec+2; else { ptr = strchr(selector_spec+2, '-'); ptr = strchr(ptr+1, '-') + 1; *(ptr-1) = '\0'; } strncpy(ptype, "", sizeof(ptype)); strncpy(ptype, selector_spec+1, ptr-selector_spec+1); strcpy(path, ptr); sprintf(sel_record, "%0*d%s\t|%c|%s|%s|%s%s%s", max_topic, ++db_index, desc, *selector_spec, ptype, path, hostname, strlen(portname) ? ":" : "", portname); if (strlen(url_spec)) add_url(sel_record, filename); /* add URL to selector */ selptr->rab$w_rsz = strlen(sel_record); if (((status = sys$put(selptr)) & 1) != SS$_NORMAL) lib$stop(status); write_index(idxptr, posptr); } } /* no selector or we want to include the default selector too */ if ((strlen(selector_spec) == 0) || switch_present("selector.both")) { /* handle /link */ if (switch_present("link")) { fprintf(lnk, "Name=%s\nType=0\n", desc); if (type == whole) /* whole file is a special case */ fprintf(lnk, "Path=0%s\n", filename); else fprintf(lnk, "Path=R%d-%d-%s\n", *start_pos, *end_pos, filename); fprintf(lnk, "Port=+\nHost=+\n\n"); } else { /* write out the selector */ if (test_size(desc, filename, 20)) { if (type == whole) /* whole file is a special case */ sprintf(sel_record, "%0*d%s\t|0|0|%s", max_topic, ++db_index, desc, filename); else sprintf(sel_record, "%0*d%s\t|0|R%d-%d|%s", max_topic, ++db_index, desc, *start_pos, *end_pos, filename); if (strlen(url_spec)) add_url(sel_record, filename); /* add URL to selector */ selptr->rab$w_rsz = 
strlen(sel_record); if (((status = sys$put(selptr)) & 1) != SS$_NORMAL) lib$stop(status); write_index(idxptr, posptr); } } } strcpy(selector_spec, ""); /* reset the current selector */ strcpy(url_spec, ""); strcpy(url_fragment, ""); if (!switch_present("quiet")) printf("%s\n", desc); /* clear words[], topics, etc. */ reset_topic(src, desc, start_pos, topics, word_pos); } /* read in a file of words */ void load_words(char *name, char *punct, table_str **table, int *table_size, int *table_index) { FILE *nf; char *cp, *cp2, line[MAX_INPUT_LINE]; static char file_name[256]; short leng; int status; $DESCRIPTOR(name_dsc, file_name); if (!switch_present(name)) return; strcpy(file_name, switch_value(name)); if ((nf = fopen(file_name, "r")) == NULL) { printf("Can't read data file %s\n", file_name); return; } while (fgets(line, sizeof(line), nf)) { cp = strchr(line, '\n'); if (cp) *cp = '\0'; /* remove newline */ for (cp = line; *cp; cp++) { if (is_punct(*cp, punct) || iscntrl(*cp)) *cp = ' '; /* convert punctuation, tabs to spaces */ *cp = tolower(*cp); /* force lowercase */ } while ((strlen(line) > 0) && (line[strlen(line)-1] == ' ')) line[strlen(line)-1] = '\0'; /* remove trailing blanks */ for (cp = line; *cp; cp++) if (*cp > ' ') break; /* find first non-blank char */ strcpy(line, cp); /* remove leading blanks */ strcat(line, " "); /* line ends with a space */ cp = line; while(cp2 = strchr(cp, ' ')) { /* break at space boundary */ *cp2 = '\0'; if (strlen(cp) > 0) { if (*table_index == *table_size) /* table full */ expand_table(table, table_size); strcpy((*table)[(*table_index)++].str, cp); } cp = cp2 + 1; } } fclose(nf); } /* see if a char is punctuation */ int is_punct(char ch, char *punct) { char *ptr; for (ptr = punct; *ptr; ptr++) if (*ptr == ch) return TRUE; return FALSE; } /* see if field is spaces */ int is_spaces(char *line, int pos, int size) { int index; if (strlen(line) < pos) return TRUE; for (index = 0; index < size; index++) if (!isspace(line[pos + index - 
1])) return FALSE; return TRUE; } /* see if the word is noise */ int is_noise(char *word, int size, int minimum_word) { int ind; if (strlen(word) < minimum_word) /* simple heuristic saves lots of noise entries */ return TRUE; if ((!switch_present("numbers")) && isdigit(*word)) return TRUE; for(ind = 0; ind < size; ind++) { if (noise[ind].str == NULL) return FALSE; if (strcmp(noise[ind].str, word) == 0) return TRUE; } return FALSE; } /* see if the word is candidate */ int is_candidate(char *word, int size) { int ind; for(ind = 0; ind < size; ind++) { if (candidate[ind].str == NULL) return FALSE; if (strcmp(candidate[ind].str, word) == 0) return TRUE; } return FALSE; } /* see if we should exclude this file */ int exclude(char *file) { char *ptr, excl[100], file_tab[6][40], excl_tab[6][40]; int status, ind; struct fscndef file_list[6], excl_list[6], scan_list[6] = {{(short) 0, (short) FSCN$_DEVICE, (long) 0}, {(short) 0, (short) FSCN$_DIRECTORY, (long) 0}, {(short) 0, (short) FSCN$_NAME, (long) 0}, {(short) 0, (short) FSCN$_TYPE, (long) 0}, {(short) 0, (short) FSCN$_VERSION, (long) 0}, {(short) 0, (short) 0, (long) 0}}; make_valid("exclude"); memcpy(file_list, scan_list, sizeof(scan_list)); if (((status = sys$filescan(descr(file), file_list, 0)) & 1) != SS$_NORMAL) lib$stop(status); for (ind = 0; ind < 5; ind++) { strncpy(file_tab[ind], (char *) file_list[ind].fscn$l_addr, file_list[ind].fscn$w_length); file_tab[ind][file_list[ind].fscn$w_length] = '\0'; } while ((ptr = switch_value("exclude")) != NULL) { strcpy(excl, ptr); memcpy(excl_list, scan_list, sizeof(scan_list)); if (((status = sys$filescan(descr(excl), excl_list, 0)) & 1) != SS$_NORMAL) lib$stop(status); for (ind = 0; ind < 5; ind++) { if (excl_list[ind].fscn$w_length == 0) strcpy(excl_tab[ind], "*"); else { strncpy(excl_tab[ind], (char *) excl_list[ind].fscn$l_addr, excl_list[ind].fscn$w_length); excl_tab[ind][excl_list[ind].fscn$w_length] = '\0'; } if (str$match_wild(descr(file_tab[ind]), 
descr(excl_tab[ind])) == STR$_NOMATCH) break; } if (ind == 5) return TRUE; /* all fields match, exclude file */ } return FALSE; } /* make a temp lowercase copy of a string */ char *lc(char *str) { # define N_STRING 4 static char strings[N_STRING][MAX_INPUT_LINE]; static int cur_string = -1; char *cp; if (++cur_string >= N_STRING) cur_string = 0; for (cp = strings[cur_string];; cp++) { *cp = tolower(*str++); if (*cp == '\0') break; } return strings[cur_string]; } /* change a string to lower case */ void lower(char *str) { char *cp; if (str) for (cp = str; *cp; cp++) *cp = tolower(*cp); } /* find where the string starts (origin 1) in record */ int find_str(char *record, char *str) { char *cp, *lcr; if ((str == NULL) || (strlen(str) == 0)) return 0; /* zero means string not found */ lcr = lc(record); cp = strstr(lcr, lc(str)); if (cp == NULL) return 0; return (cp - lcr + 1); } /* descr() creates character descriptor and return the address of the descriptor to the caller. */ # define N_DESCR 10 static struct dsc$descriptor_s str_desc[N_DESCR]; static int cur_descr = -1; struct dsc$descriptor_s *descr(char *string) { if(++cur_descr >= N_DESCR) cur_descr = 0; str_desc[cur_descr].dsc$w_length=(short)strlen(string); str_desc[cur_descr].dsc$b_dtype=DSC$K_DTYPE_T; str_desc[cur_descr].dsc$b_class=DSC$K_CLASS_S; str_desc[cur_descr].dsc$a_pointer=string; return &str_desc[cur_descr]; } int get_decimal(char *ptr) { do ptr++; while ((*ptr != '=') /* skip to the keyword/parameter */ && (*ptr != ':')); /* separator character */ while (isspace(*++ptr)); /* skip spaces */ return (atoi(ptr) < 256) ? 
atoi(ptr) : 256; } char *get_text(char **dest, char *ptr) { char *start, *cp; do ptr++; while ((*ptr != '=') /* skip to the keyword/parameter */ && (*ptr != ':')); /* separator character */ while (isspace(*++ptr)); /* skip spaces */ if (*ptr == '"') { /* if quoted string */ start = ++ptr; /* skip over quote */ for (; *ptr; ptr++) { /* skip to ending quote */ if ((*ptr == '"') && (*(ptr+1) == '"')) { /* doubled quotes? */ ptr++; /* yes, skip it */ continue; } if (*ptr == '"') /* un-doubled quote? */ break; /* yes, stop here */ } } else { /* else non-quoted string */ start = ptr; /* start of string */ while (*ptr && (*ptr != ' ') && (*ptr != ',') && (*ptr != '/') && (*ptr != ')')) ptr++; /* skip to string terminator */ } *dest = (char *) calloc((ptr - start) + 1, sizeof(char)); check_alloc(*dest); strncpy(*dest, start, ptr - start); for (cp = *dest; *cp; *cp++) /* collapse doubled quotes to single */ if ((*cp == '"') && (*(cp+1) == '"')) strcpy(cp, cp+1); return ptr; } int get_file_type(char *ptr) { int status = 1; /* assume no qualifier */ do { ptr++; if (*ptr == '=') status = 2; /* only one qualifier */ } while (*ptr /* skip to next clause */ && (*ptr != ',') /* keyword */ && (*ptr != '/') /* switch */ && (*ptr != ')')); /* or parameter */ return status; } /* parse command line for /topic */ void parse_topic(char *line, topic_str *topics) { char *ptr, *start, **dest; static int index = -1; ptr = line; /* point to start of line */ for (;;) { /* search for /topic until end of line */ if (index == TOPIC_SIZE) return; /* exit if we can't hold any more */ ptr = strchr(ptr, '/'); /* search for switch start */ if (ptr == NULL) return; /* no more switches */ while (isspace(*++ptr)); /* skip spaces */ if (tolower(*ptr) != 't') /* topic is unique to one character */ continue; /* not /topic, keep scanning */ do ptr++; while (*ptr && (*ptr != '=') /* skip to the keyword/parameter */ && (*ptr != ':')); /* separator character */ if (!*ptr) return; /* ran out of command line */ 
while (isspace(*++ptr)); /* skip spaces */ if (*ptr == '(') /* if start of list */ while (isspace(*++ptr)); /* skip spaces */ index++; /* next topics structure */ topics[index].used = TRUE; /* this topic index is used */ topics[index].pos = DEFAULT_POS; /* default position */ topics[index].end = ""); parse_switch("selector.default", sw, 0, 0, 0); parse_switch("selector.both", sw, 0, 0, 0); parse_switch("selector.ignore", sw, 0, 0, 0); parse_switch("sequential", sw, 0, 0, 0); parse_switch("url", sw, 0, 0, 0); parse_switch("url.text", sw, 0, 0, 0); parse_switch("url.end", sw, 0, 1, "\">"); parse_switch("url.default", sw, 0, 0, 0); parse_switch("url.both", sw, 0, 0, 0); parse_switch("url.prefix", sw, 0, 0, 0); parse_switch("url.fragment", sw, 0, 0, 0); parse_switch("version", sw, 0, 0, 0); parse_switch("whole", sw, 0, 0, 0); parse_switch("word_length", sw, 0, 1, "20"); if (switch_present("keyword")) { keyword_index = -1; parse_keyword(input->dsc$a_pointer, keywords); /* fill keywords[] */ } parse_topic(input->dsc$a_pointer, topics); /* parse the command line and fill topics */ } int switch_present(char *name) { int ind; for (ind = 0; switches[ind].name; ind++) if (strcmp(name, switches[ind].name) == 0) break; if (switches[ind].name == NULL) return FALSE; return switches[ind].state; } char *switch_value(char *name) { int ind; for (ind = 0; switches[ind].name; ind++) if ((strcmp(name, switches[ind].name) == 0) && (switches[ind].multiple < 2)) break; if (switches[ind].multiple == 1) switches[ind].multiple = 2; /* invalidate current entry */ return switches[ind].value; } void make_valid(char *name) { int ind; for (ind = 0; switches[ind].name; ind++) if ((strcmp(name, switches[ind].name) == 0) && (switches[ind].multiple == 2)) switches[ind].multiple = 1; } void *my_realloc(void *mem, int size) { void *ptr; if (mem == (void *) 0) { ptr = (void *) malloc(size); check_alloc(ptr); return ptr; } else { ptr = (void *) realloc(mem, size); check_alloc(ptr); return ptr; } } /* 
sanity check memory allocation calls:
   a null pointer is signalled through lib$stop() with SS$_INSFMEM,
   anything else is accepted */
void check_alloc(void *ptr)
{
    int status;

    if (ptr != (void *) 0)
        return;
    status = SS$_INSFMEM;
    lib$stop(status);
}