#!/bin/perl # # search - front end to swish-e for the LHSC web # # Copyright (C) 1999 Steve van der Burg # # This program is licensed under the same terms as described in the Artistic # License that is included with perl 5 and later. # # Revision history: # # When Who What # --------- --- --------------------------------------------------------------- # Jan 18/99 sjv Created. # Jun 2/99 sjv Fixed blank "also search" entry for bad filter spec. # Jul 21/99 sjv Changed layout a bit. # Aug 17/99 sjv Added "result pages" options. # Aug 31/99 sjv Added support for document abstracts (added by custom spider). # Sep 9/99 sjv Pipe-open to swish replaced with fork-open + exec to avoid # shell argument expansion. # Sep 15/99 sjv Removed site-specific stuff for more sensible distribution. # Sep 29/99 sjv url_param() call replaced with param() -- we're no longer # mixing POST and GET, which was always slightly evil anyway. # Aug 14/00 sjv Fixed the tie_gdbm call. Thanks to Matt Macdonald. # use CGI qw(:all); use Fcntl; use GDBM_File; use strict; my $trouble = 0; # set to 1 if searches are down. my %filters = ( default => { URL => '', text => 'the entire site', image => '', # replace with URL for logo, for example. }, Private => { URL => 'priv/', text => 'Private parts of the site', image => '', }, Tiny => { URL => 'a/very/small/part/of/the/site/', text => 'A tiny part of the site', image => '', }, ); $CGI::POST_MAX = 16 * 1024; my $query = new CGI; $query->import_names('F'); my $filter = param('filter'); my $filt = $filter if $filter && exists $filters{$filter}->{URL}; $filt ||= 'default'; my $title = 'Search '.$filters{$filt}->{text}; my $this_cgi = url(); $this_cgi .= "?filter=$filt" unless $filt eq 'default'; my $swish_e = '/opt/lhsc/www/swish/swish-e'; my $sw_config = '/www/database/swish-e/http.conf'; my $sw_index = '/www/database/swish-e/lhsc.index'; my $max_results = 1000; my $psize = 20; $| = 1; print header; print start_html(-title=>$title.($F::sch_spec ? ' - Results' : ''), -bgcolor=>'#ffffff'); print table({-cellpadding=>0,-cellspacing=>0,-border=>0,-width=>'100%'}, Tr( td({-valign=>'bottom'},i(font({-size=>'+3'},$title))), ( $filters{$filt}->{image} ? td(img({-align=>'right',-border=>0, -src=>$filters{$filt}->{image}})) : '' ), ), ); if ( $F::sch_spec ) { get_results(); } else { get_spec(); } print p(), end_html(); sub search_alts { my @alts; foreach ( sort { lc($a) cmp lc($b) } keys %filters ) { next if $_ eq $filt; next unless $filters{$_}->{text}; my $this_cgi = url(); $this_cgi .= "?filter=$_" unless $_ eq 'default'; push @alts,a({-href=>$this_cgi},$filters{$_}->{text}); } return @alts; } sub search_box { my ($slab,$default) = @_; return startform(-action=>$this_cgi,-method=>"GET"), $filt ne 'default' ? hidden(-name=>"filter",-value=>$filt,-override=>1) : '', hidden(-name=>"start",-value=>1,-override=>1), hidden(-name=>"end",-value=>$psize,-override=>1), center( table({-cellspacing=>0,-cellpadding=>9,-border=>1,-bgcolor=>'#cccccc'}, Tr({-align=>'center'}, td( textfield(-name => 'sch_spec', -default => $default, -size => 25, -maxlength => 99). '  '. submit(-value=>$slab), $default ? (' ' x 2).' or '.(' ' x 2). a({-href=>$this_cgi},"Start a new search") : () ), ), ), ), endform(); } sub get_spec { my @alts = search_alts; my $alt_msg = Tr(td({-valign=>'top'},"Or restrict",br(),"your search to:"), td( font({-size=>"-1"},ul(li(\@alts))) )) if (scalar @alts) > 0; print table({-cellpadding=>8}, Tr({-valign=>"top",}, td({-bgcolor=>"#ffffcc"}, $trouble ? center(font({-size=>"+2"}, b("Local web searching is currently unavailable.",br(), "Please try again later"))). p() : search_box('Search'), center( table( $trouble ? () : $alt_msg, Tr(td({-colspan=>2,-align=>'center'}, "You can also ", a({-href=>'/search.htm'},"search the Internet") ) ) ), ), ), td( font({-size=>'-1'}, b("Search help:"), br(), "Documents containing ",b("all")," of the words entered here will be ", "displayed. Searches are not case-sensitive.", p(), "The search engine matches search terms (words) using a ", a({-href=>"http://sunsite.berkeley.edu/SWISH-E/Manual/stemming.html"}, "stemming algorithm"), " that makes words like ", b("book").", ".b("books").", ".b("booked").", and ".b("booking"), " look the same (ie. return the same search results). ", "To search for partial words, use an asterisk (*). ", "For example, enter ",b("sal*")," to find ", i("salt")," and ",i("sale."), "You can use the words AND, OR and NOT to create more ", "complex queries like ", b("drug AND interac*")," & ",b("blood OR donor").". ", "Parentheses can also be used to group terms: eg. ", b("counsel* AND (genetic or grief)").". ", "Searches powered by ", a({-href=>'http://sunsite.berkeley.edu/SWISH-E/'},"SWISH-e").".", ), ))); } sub get_results { my $clean_spec = $F::sch_spec; my @results; if ( open(SWISH,"-|") ) { while () { next unless /^\d/; chomp; my ($rank,$URL,$title,$length) = split(/"/,$_); $title = '(no title)' unless $title; if ( $filt ne 'default' ) { next unless $URL =~ m#$ENV{WEBHOST}/$filters{$filt}->{URL}#o; } push @results,[ $URL, a({-href=>$URL},$title) ]; } close SWISH; } else { # This is the child that feeds the parent (above): exec $swish_e,'-w',$clean_spec, '-m',$max_results, '-f',$sw_index, '-d','dq'; die "Could not exec swish: $!"; } my $total = $#results + 1; my $start = param("start"); $start = 1 if $start eq 'all' || $start < 1; my $end = param("end"); $end = $#results + 1 if $end eq 'all' || ! $end; $end = $total if $end > $total; $start = $end if $start > $end; @results = splice(@results,$start-1,$end-$start+1) if $total && $end-$start < $#results; # Look up abstract (first mumble bytes of each page): # my $abst = '/www/database/swish-e/abstract.gdbm'; my $resno = $start; if ( tie_gdbm($abst,\my %adb,'read',5,'none') ) { foreach ( @results ) { push @{$_},$resno++,$adb{$_->[0]}; } untie %adb; } my @pages; if ( $total > $psize ) { my $this_cgi = url()."?sch_spec=".CGI::escape($clean_spec); $this_cgi .= "&filter=$filt" unless $filt eq 'default'; my ($prev,$next,$currpg); foreach ( 1..100 ) { my ($pst,$pend) = make_range($_,$total); $currpg = $_ if $pst == $start && $pend == $end; push @pages,$pst == $start && $pend == $end ? b($_) : a({-href=>$this_cgi."&start=$pst&end=$pend"},$_) if $_ < 16; last if $pend == $total; } if ( $currpg > 1 ) { my ($pst,$pend) = make_range($currpg-1,$total); $prev = a({-href=>$this_cgi."&start=$pst&end=$pend"}, '['.b("<< Prev").']'); } if ( $end < $total ) { my ($pst,$pend) = make_range($currpg+1,$total); $next = a({-href=>$this_cgi."&start=$pst&end=$pend"}, '['.b("Next >>").']'); } push @pages,a({-href=>$this_cgi."&start=1&end=$total"},"all"); push @pages,$next if $next; unshift @pages,$prev if $prev; } print search_box('Refine Search',$clean_spec), b($total ? "$start to $end of $total" : "No","matches on ". "search for \"$clean_spec\""); print br(), b("Result pages: "), font({-size=>"-1",-face=>"Arial"},join('  ',@pages)) if @pages; print p(), map { dl(b(dt($_->[2].'. '.$_->[1])).dd($_->[3])) } @results; print b("Result pages: "), font({-size=>"-1",-face=>"Arial"},join('  ',@pages)) if @pages; print p(), search_box('Refine Search',$clean_spec) if scalar @results > 40; } sub make_range { my ($pno,$total) = @_; my $pst = $pno * $psize - $psize + 1; my $pend = $pno * $psize > $total ? $total : $pno * $psize; return $pst,$pend; } sub tie_gdbm { my ($gdbm_db,$hashref,$tiefunc,$time_to_try) = @_; my $i = 0; my $tieres; my $db_mode; my $tres; $db_mode = ($tiefunc eq 'read') ? GDBM_READER : GDBM_WRCREAT; $time_to_try = 0 if $time_to_try < 0; while ($i < $time_to_try) { tie (%$hashref, 'GDBM_File', $gdbm_db, $db_mode, 0640); $tieres = $!; $tres = tied(%$hashref); if ( defined($tres) ) { last; } else { sleep 1; } $i++; } if ( defined($tres) ) { undef $tres; return 1; } else { return undef; } }