#!/usr/local/bin/perl
######################################################################
#
#   Program:    Sethi Search Utility
#               A simple command line utility to submit a search to
#               Google, Yahoo, or M$N and show the (text) results
#
#   Version:    0.8.7
#               Download from http://www.sethi.org/tools/
#
#   Output:     Sethi Search automagickally:
#               - runs the search on the specified engine
#               - creates a /tmp/engine.config if not specified
#               - displays the results
#
#   License:
#               GPL & Postcard-Ware!  If you like this program and
#               use it, drop me a postcard or an email or just sign
#               our guestbook and tell me how great I am. :)  And
#               hey, if you *really* like it, don't worry about
#               designing that shrine dedicated to my greatness... a
#               simple link back to our homepage should suffice.
#               Disclaimer: please excuse the messy code... this was
#               a quick & dirty kludge done in a hurry.
#
#   Credits:
#               Based on my grab-cnn-news.pl
#
#   Author:
#               Ricky J. Sethi (rickys@sethi.org)
#               http://www.sethi.org/
#
#   Changes:
#               This is the compact version which includes an
#               abbreviated engine.config file for Google, Yahoo, and
#               M$N directly in the script itself.  However, since
#               the engines tend to change their formats frequently, it
#               probably needs quite a bit of maintenance/updating.
#               LATEST CHANGE: Updated Google's HitStart in config
#
#               2006-05-21: Updated Google's config (since that's all
#               I use; apologies to any real programmers out there; I first created
#               this little hack about five years ago, before I knew enough to use
#               good software design and OOP principles; it's in dire need of a
#               complete re-write, especially to avoid screen-scraping and be a good
#               engine using their search APIs)
#
#               2007-04-03: Updated Google's config again.  Let me
#               reiterate once again that I'm not really maintaining
#               this and that it's in DIRE need of complete re-write
#               using good OOP principles and the APIs instead of
#               screen-scraping.  However, I'm too busy/lazy and this was an easy hack... :)
#
#   TODO:
#               * Include my extended engine.config file (it contains
#                 about 8 engines)?
#
#   Usage:
#       search [-e google] [-h 20] [-C /path/to/configFile] 'search_term'
#       where:
#               -e: Name of the Search Engine
#               -h: Number of Hits to return
#               -C: Custom Configuration File
#
######################################################################

use strict;
use warnings;

######################################################################
#
#   Customizations -- make changes below
#   (No changes needed below here)
#
######################################################################

### Set Default Config File Location (overwritten on EACH run/call):
our $default_config = "/tmp/sethi_search_default.config";

######################################################################

############### MAIN ###############
# Simple script to submit a search to Google and show results
# Takes argument that says how many hits to limit to
# Usage: search [-e google] [-h 20] [-C /path/to/configFile] 'search_term'

### Standard Perl Libraries I need
use Config;
use LWP::Simple;
use LWP::RobotUA;
use LWP::UserAgent;
use HTTP::Request;
use Getopt::Std;
use URI::Escape;

### Debug?
our $DEBUG = 0;

### State shared between the main flow and the subroutines below.  These
### stay package globals (our) because Getopt::Std fills in $opt_* by name
### and CreateDefaultConfig reads $default_config as a package variable.
our ($opt_e, $opt_h, $opt_C);
our ($configFile, $search, $usage);
our (%searchEngines, %enginesToDo);

### Get the options:
getopt('ehC');

# -h must be a whole number; anything else falls back to 10 hits.
$opt_h = 10 unless defined $opt_h && $opt_h =~ m/\A[0-9]+\z/;

# -e is matched case-insensitively; default engine is google.
$opt_e = (defined $opt_e && $opt_e =~ m/[a-z]/i) ? lc($opt_e) : "google";

if (defined $opt_C && length $opt_C) {
    $configFile = $opt_C;
}
else {
    ### Create Default engine.config
    # Called with & because the sub is defined further down with a ()
    # prototype; the & form is not prototype-checked, so no
    # "called too early to check prototype" warning is raised.
    &CreateDefaultConfig();
    $configFile = $default_config;
}

### Get search term:
$search = shift @ARGV;

### Read the Engines configuration file into %searchEngines
ReadEngines();

### Okay to proceed?
$usage = "Usage: search [-e eng_name] [-h num_lines] [-C config_file_path] 'search_term'";

# Test definedness/length rather than truth so that a search for "0" works.
die "$usage\n" unless defined $search && length $search;

unless ($enginesToDo{$opt_e}) {
    print "Engine must be one of the following:\n";
    foreach my $eng (sort keys %enginesToDo) {
        print ucfirst($eng) . " \n";
    }
    #print "\n";
    exit 1;    # an unknown engine is an error, so do not report success
}

if ($DEBUG) {print "Searching $opt_e for $search and returning $opt_h lines\nusing $configFile\n";}

### Get the data:
my $urldata = ParseUrlData($opt_e);
if ($DEBUG) {print "Searched $opt_e for $search and found $urldata\n";}
print $urldata;


############### Get Search Results ###############
# Return parsed & processed Search Results
# Only do single engine for now (allow multiple engines later)
#
# Parameters:
#   $engineName -- lower-cased engine name; used as the key prefix into
#                  %searchEngines (as filled in by ReadEngines)
# Reads the globals $search, $opt_h, $configFile, $DEBUG and %searchEngines.
# Returns:
#   one string holding the formatted hits followed by a summary line
# Dies:
#   when the engine's NumberHitsPerPage is not a positive whole number
#
sub ParseUrlData {
    my ($engineName) = @_;

    # Missing config keys become '' so they interpolate without warnings.
    my $setting = sub {
        my $value = $searchEngines{$engineName . $_[0]};
        return defined $value ? $value : '';
    };

    # Encode the search string first, then build the page-independent URL:
    my $term    = uri_escape($search);
    my $baseUrl = $setting->('SearchEngineURL') . $term;

    my $results = "\t*****************************************************************\n\n";
    my $rank    = 0;

    # Set start and end stuff
    my $itVariable = $setting->('PageCounterVariable');
    my $itStarter  = $setting->('PageCounterStarter');
    my $hsep       = $setting->('HitSeparator');
    my $hsepend    = $setting->('HitSeparatorEnd');
    my $hsepstart  = $setting->('HitSeparatorStart');
    my $hstart     = $setting->('HitStart');
    my $hend       = $setting->('HitEnd');

    # A zero or empty page size would make the paging loop below spin
    # forever, so insist on a positive whole number up front.
    my ($hitsPerPage) = $setting->('NumberHitsPerPage') =~ m/\A\s*([0-9]+)\s*\z/;
    unless ($hitsPerPage) {
        die "Engine '$engineName' needs a positive NumberHitsPerPage in $configFile.\n";
    }

    # Create a user agent object once; it is the same for every page.
    # (Google doesn't like the simple LWP::Simple get().)
    my $ua = LWP::UserAgent->new;
    $ua->agent("Sethi Search Agent");
    $ua->timeout(300);

    # Get required number of pages:
    # Couldn't just use a NumberOfHitsWanted variable since all
    # engines (notably Altavista) don't support it
    PAGE:
    for (my $page = 0; $page * $hitsPerPage < $opt_h; $page++) {
        # Needed to do this so that we could use full regex in
        # PageCounterVariable (mainly for M$'s MSN).  Always rewrite a fresh
        # copy of the base URL: once the starter value has been replaced it
        # can no longer be found, so rewriting the same string in place
        # would leave later pages stuck on an earlier offset.
        my $offset = $page * $hitsPerPage;
        (my $url = $baseUrl) =~ s%($itVariable)$itStarter%$1$offset%;
        if ($DEBUG) {print $url . "\n";}

        my $req = HTTP::Request->new(GET => $url);
        $req->content_type('text/html');
        my $res = $ua->request($req);

        # Successful?  If not, skip the page instead of re-parsing the
        # previous page's document.
        unless ($res->is_success) {
            warn "Found Errors here: " . $res->status_line . "\n";
            next PAGE;
        }
        my $doc = $res->content;
        if ($DEBUG > 3) {print $doc;}

        ### Parse the link
        my @searchlines = split(/$hsep/, $doc);
        if ($DEBUG > 1) {print "For $hsep, Got number of lines: ". $#searchlines ."\n";}

        HIT:
        foreach my $line (@searchlines) {
            if ($DEBUG > 3) {print "\n\nGot line: $line\n";}

            # Only blocks containing both hitstart and hitend are hits.
            # The (?:...) wrapper keeps an empty config value from turning
            # into an empty pattern, which Perl would treat as "reuse the
            # last successful pattern".
            next HIT unless $line =~ m%(?:$hstart)% && $line =~ m%(?:$hend)%;

            $rank++;
            if ($DEBUG > 1) {print "With HitStart: $hstart and HitEnd: $hend, got Rank: $rank for line: $line\n";}

            # If already found as many as we need, break out:
            if ($rank > $opt_h) {
                $rank--;
                last HIT;
            }

            # If first one on a page, hose any initial stuff:
            if ( ($rank - 1) % $hitsPerPage == 0 ) {
                if ($line =~ s%^.*?($hsepstart.*)%$1%s) {
                    if ($DEBUG > 1) {print "For FIRST one with HitSeparatorStart: $hsepstart, hosing $1 \n\n\n from $line\n";}
                }
            }

            # If last one on a page, hose any trailing info, if necessary:
            if ( ($rank % $hitsPerPage == 0) && length $hsepend ) {
                $line =~ s%(.*?$hsepend).*%$1%s;
            }

            # Save URL info.  Take the capture from the match's own return
            # value so a failed match yields '' rather than a stale $1.
            my ($saveurl) = $line =~ m%$hstart.*?(https?://.*?)("?>|$hend)%i;
            $saveurl = '' unless defined $saveurl;

            $line = _CleanHitText($line);

            # Output final result (number it ourselves unless the engine
            # already numbers its hits):
            if ($searchEngines{$engineName . 'PositionIdentifierRegex'}) {
                $results .= $line . "\n\t[URL: $saveurl]\n\n";
            }
            else {
                $results .= "$rank. " . $line . "\n\t[URL: $saveurl]\n\n";
            }
        }
    }

    $results .= "\n***** Searched " . ucfirst($opt_e) . " for '$search' and returned $rank hits.\n";

    return $results;
}


############### Subroutine for Cleaning Up One Hit ###############
# Turns one HTML hit fragment into plain text: removes tags, decodes the
# few entities handled here, and collapses runs of blank lines.
#
# Parameters:
#   $text -- HTML fragment for one hit
# Returns:
#   the cleaned-up text
#
sub _CleanHitText {
    my ($text) = @_;

    # Hose all HTML tags (/s so a tag may span several lines):
    $text =~ s%<.*?>%%gs;

    # Decode entities; &amp; goes last so "&amp;lt;" is not decoded twice.
    $text =~ s%&nbsp;% %g;
    $text =~ s%&quot;%"%g;
    $text =~ s%&bull;%*%g;
    $text =~ s%&lt;%<%g;
    $text =~ s%&gt;%>%g;
    $text =~ s%&amp;%&%g;

    # Hose consecutive newlines:
    $text =~ s%(\n\s*)+%\n%g;

    return $text;
}


############### Subroutine for Reading Engines File ###############
# Adapted from lonelyeye.pl
# Only uses SearchEngineURL, HitSeparator, HitStart, and HitEnd from
# the engine.config file
#
# Reads $configFile.  Each engine definition starts with a line of at
# least 25 dashes, is followed by "Key: value" lines, and ends at a key
# containing ENDengineDEFINITION.  Blank lines and lines starting with #
# are skipped.
#
# Fills two globals:
#   %enginesToDo   -- lower-cased engine name => same name
#   %searchEngines -- lower-cased engine name . Key => value
# Dies:
#   when the file cannot be opened, or when a definition does not start
#   with the dashed separator line
#
sub ReadEngines {
    open(my $config, '<', $configFile)
        or die "Couldn't open Engines' Configuration File $configFile ($!).\n\n";

    ### Build up SearchEngines Hash
    ENGINE:
    while (defined(my $separator = <$config>)) {
        chomp($separator);
        $separator =~ s/\r\z//;    # tolerate CRLF config files
        next ENGINE if $separator eq "" || $separator =~ m%^\#%;

        if ($separator !~ m%^-{25}%) {
            # Fatal Error Message
            die "Engines' Configuration File has the wrong format since I got $separator.\n\n";
        }

        # Fresh hash per definition, so settings from one engine never
        # carry over into the next one.
        my %variable;

        FIELD:
        while (defined(my $entry = <$config>)) {
            chomp($entry);
            $entry =~ s/\r\z//;

            # Skip blank lines and comments
            next FIELD if $entry eq "" || $entry =~ m%^\#%;

            my ($key, $value) = split(/:/, $entry, 2);
            last FIELD if $key =~ m%ENDengineDEFINITION%;

            # A key without a colon has no value; hose leading whitespace:
            $value = "" unless defined $value;
            $value =~ s/^\s*//;
            $variable{$key} = $value;
        }

        # A separator with no definition behind it (e.g. a closing dashed
        # line at the end of the file) names no engine; ignore it.
        my $name = $variable{'SearchEngineName'};
        next ENGINE unless defined $name && length $name;

        # lowercase searchEngine name and store it:
        my $searchEngine = lc($name);

        ### Add name of this engine to second hash (%enginesToDo)
        $enginesToDo{$searchEngine} = $searchEngine;

        ### Now build the %searchEngines hash:
        foreach my $key (keys %variable) {
            $searchEngines{$searchEngine . $key} = $variable{$key};
        }
    }

    close($config);
    return;
}


# Strictures are switched off from here on: the rest of the file was
# written without them, and this keeps its original semantics.
no strict;

############### Subroutine for Creating Default engine.config ###############
# Just setup Google, Yahoo, and MSN
#
#
sub CreateDefaultConfig () {
    open (CONFIGFILE, ">$default_config");
    print CONFIGFILE <<'ENDOFCONFIG';
#---------------------------------------------
# TODO:
#  - Add a KillBefore and KillAfter where everything before the
#    KillBefore regex is dropped from the $doc and everything after the
#    KillAfter regex is dropped from the $doc.
# # # # SearchEngineName: -- String: One word name of engine (serves as key in engines hash) # SearchEngineNote: -- String: Any additional notes (displayed on results page only) # SearchEngineURL: -- String: Special format; if it uses GET, last field should be search field # SearchEngineLogo: -- String: Simple URL of logo # RegExNoMatch: -- Regex: If regexnomatchState is +, this is the phrase that says "No results found"; if -, says "Found 32 matches" # SearchType: -- String: GET or POST # PostSubmitContent: -- String: If POST method used, this contains the variables (last one should be search field -- see SearchEngineURL) # TotalFoundRegEx: -- Regex: E.g, "Google results .*? of about (.*) for " # LimitPages: -- Numeric: Should we limit search to a certain number of pages (instead of the maximum number of pages)? # NumberHitsPerPage: -- Numeric: How many hits are returned to a page? # PositionIdentifierRegex: -- Regex: Does the engine number the returned hits? # HitSeparator:

-- String: Splits document on the HitSeparator; if no numbering provided by engine, this lets us count it ourselves # HitStart: -- Regex: Start of hit # HitEnd: -- Regex: End of hit # PageCounterVariable: -- String: Which variable in the SearchEngineURL or PostSubmitContent controls which hit number of results to start with # PageCounterIncrementer: -- Numeric: By how many hits should we increase each time (usually this is the NumberHitsPerPage) # PageCounterStarter: -- Numberic: What number does the count start with? Usually 0 or 1 # ENDengineDEFINITION # # General Tips for Regexes: # 1. Make sure you use NON-GREEDY versions (e.g., \s*? instead of \s*) # 2. Remember that the magic switches %ms are set so . matches # newlines across WHOLE document (i.e., ^ and $ don't stop at \n; # see p.233 of Mastering Regular Expressions # 3. Test regexes by turning on $DEBUG; make sure you individually # test EACH engine's different regexes (use the engine.config.test # file)! # # @NOTE: In order to figure out correct regex patterns below, turn on DEBUG'ing and look at generated page as View Source in the Browser looks at JavaScript interpreted page! # --------------------------------------------- SearchEngineName: google SearchEngineNote: SearchEngineURL: http://www.google.com/search?sourceid=sethi&hl=en&ie=UTF-8&oe=UTF-8&start=0&q= SearchEngineLogo: http://www.google.com/logos/Logo_40gry.gif RegExNoMatchState: Positive RegExNoMatch: did not match any documents in this database SearchType: Get PostSubmitContent: TotalFoundRegEx: Results .*? - .*? of about (.*) LimitPages: NumberHitsPerPage: 10 PositionIdentifierRegex: # Not sure why the works... but it breaks the page into chunks # on the and then searches for hitStart and hitEnd to qualify # whether or not that chunk contains a legitimate hit or not... # 2006-05-21: Apparently, the comments are parsed out by default with LWP #HitSeparator: #HitSeparator: HitSeparator: # Added in for Google on 2007-04-03: HitSeparatorStart:

HitSeparatorEnd: #HitStart:

PageCounterVariable: start= PageCounterIncrementer: 10 PageCounterStarter: 0 ENDengineDEFINITION --------------------------------------------- # For Yahoo, check to see if it says "Go To Web Page Matches" or "Next # __ Matches" which means that it found the search term on local # yahoo; so if it's successful like that, simply return the page # number it's on. If it says "Web Pages (1-2 of 2)" or "Sorry, no # matches were found containing" then no Yahoo match. # # Do a Yahoo!Web search separately (treat as different engine). # # Web Pages search: # http://ink.yahoo.com/bin/query?p=foo&hc=2&hs=24 # Yahoo Categories search: # http://search.yahoo.com/search?p=foo&hc=2&hs=24&h=c # Yahoo Sites search: # http://search.yahoo.com/search?p=foo&hc=2&hs=24&h=s # Yahoo General search: # http://search.yahoo.com/search?p=foo # # For Yahoo! regular (i.e., not web pages) search, ONLY search sites # since categories can't contain corroborative URL!!! # SearchEngineName: yahoo SearchEngineNote: SearchEngineURL: http://search.yahoo.com/bin/search?b=1&h=s&p= SearchEngineLogo: http://a1.g.a.yimg.com/7/1/31/000/us.yimg.com/i/yahoo.gif RegExNoMatchState: Positive RegExNoMatch: Search\s*?Result.*?Found.*\s*?web pages\s*?for SearchType: Get PostSubmitContent: TotalFoundRegEx: Found .*?\s*categories\s*and (.*?)\s*sites\s*for LimitPages: NumberHitsPerPage: 20 PositionIdentifierRegex: #HitSeparator: \n HitSeparator:

  • HitSeparatorStart: HitSeparatorEnd: HitStart: \s* LimitPages: NumberHitsPerPage: 20 PositionIdentifierRegex: HitSeparator: HitSeparatorStart: HitSeparatorEnd: HitStart: