#!/usr/local/bin/perl


######################################################################
# 
#      Program: Sethi Search Utility
#               A simple command line utility to submit a search to
#               Google, Yahoo, or M$N and show the (text) results
# 
#      Version: 0.8.7
#               Download from http://www.sethi.org/tools/
# 
#      Output:  Sethi Search automagickally:
#               - runs the search on the specified engine
#               - creates /tmp/sethi_search_default.config if no -C file is specified
#               - displays the results
# 
#      License:
#               GPL & Postcard-Ware!  If you like this program and
#               use it, drop me a postcard or an email or just sign
#               our guestbook and tell me how great I am. :)  And
#               hey, if you *really* like it, don't worry about
#               designing that shrine dedicated to my greatness... a
#               simple link back to our homepage should suffice.
#               Disclaimer: please excuse the messy code... this was
#               a quick & dirty kludge done in a hurry.
# 
#      Credits:
#               Based on my grab-cnn-news.pl
# 
#      Author:
#               Ricky J. Sethi (rickys@sethi.org)
#               http://www.sethi.org/
# 
#      Changes:
#               This is the compact version which includes an
#               abbreviated engine.config file for Google, Yahoo, and
#               M$N directly in the script itself.  However, since
#               the engines tend to change their formats frequently, it
#               probably needs quite a bit of maintenance/updating.
#               LATEST CHANGE:  Updated Google's HitStart in config
# 
#               2006-05-21:  Updated Google's config (since that's all
#               I use; apologies to any real programmers out there; I first created
#               this little hack about five years ago, before I knew enough to use
#               good software design and OOP principles; it's in dire need of a
#               complete re-write, especially to avoid screen-scraping and be a good
#               engine using their search APIs) 
# 
#               2007-04-03:  Updated Google's config again.  Let me
#               reiterate once again that I'm not really maintaining
#               this and that it's in DIRE need of complete re-write
#               using good OOP principles and the APIs instead of
#               screen-scraping.  However, I'm too busy/lazy and this was an easy hack... :)
# 
#      TODO:
#               * Include my extended engine.config file (it contains
#                 about 8 engines)?
# 
#      Usage:
#               search [-e google] [-h 20] [-C /path/to/configFile] 'search_term'
#               where:
#               -e: Name of the Search Engine
#               -h: Number of Hits to return
#               -C: Custom Configuration File
# 
######################################################################


######################################################################
#
# Customizations -- make your changes in this section only
# (No changes are needed after the end of this section)
# 
######################################################################

### Set Default Config File Location (overwritten on EACH run/call):
### The built-in engine definitions are written to this path every time
### the script runs without a -C option, and are then read back from it.
### NOTE(review): this is a fixed, predictable name in /tmp -- consider
### pointing it at a private directory on multi-user machines.
$default_config = "/tmp/sethi_search_default.config";
######################################################################


############### MAIN ###############
# Simple script to submit a search to Google and show results
# Takes argument that says how many hits to limit to
# Usage: search [-e google] [-h 20] [-C /path/to/configFile] 'search_term'


### Standard Perl Libraries I need
use Config;
use LWP::Simple;
use LWP::RobotUA;
use Getopt::Std;
use URI::Escape;


### Debug?  0 is quiet; 1 traces the search, >1 traces hit parsing and
### >3 also dumps every fetched page and candidate block.
$DEBUG = 0;


### Get the options (each of -e, -h and -C takes an argument, which
### Getopt::Std stores in $opt_e, $opt_h and $opt_C):
getopt('ehC');

# The hit count has to be a whole number; anything else (missing,
# "abc", "5x", ...) falls back to the default of 10.
unless (defined($opt_h) && $opt_h =~ m/\A[0-9]+\z/) {$opt_h = 10;}

# Engine names are compared in lower case; default to Google.
$opt_e = lc(defined($opt_e) ? $opt_e : "");
unless ($opt_e =~ m/[a-z]+/) {$opt_e = "google";}


### Get search term and bail out early, i.e. before any file is
### written, when there is none.  Test for length rather than truth so
### that searching for "0" works.
$usage = "Usage: search [-e eng_name] [-h num_lines] [-C config_file_path] 'search_term'";
$search = shift;
unless (defined($search) && length($search)) {die "$usage\n";}


### Pick the configuration file: any non-empty -C path is honoured,
### otherwise the built-in default config is (re)created and used.
if (defined($opt_C) && length($opt_C)) {
    $configFile = $opt_C;
} else {
    ### Create Default engine.config
    CreateDefaultConfig();
    $configFile = $default_config;
}


### Read the Engines configuration file into %searchEngines
### (and the list of known engine names into %enginesToDo)
ReadEngines();


### Okay to proceed?  The requested engine must be defined in the config.
unless ($enginesToDo{$opt_e}) {
    print "Engine must be one of the following:\n";
    foreach my $eng (sort keys %enginesToDo) {
        print ucfirst($eng) . " \n";
    }
    #print "\n";
    exit;
}


if ($DEBUG) {print "Searching $opt_e for $search and returning $opt_h lines\nusing $configFile\n";}


### Get the data:
$urldata = ParseUrlData($opt_e);
if ($DEBUG) {print "Searched $opt_e for $search and found $urldata\n";}
print "$urldata";


############### Get Search Results ###############
# ParseUrlData($engineName)
#
# Runs $search against the engine named $engineName (a lower-case engine
# name as stored by ReadEngines) and returns the hits as one printable
# string: a banner line, one entry per hit (the hit's text with HTML
# tags removed, followed by a "[URL:  ...]" line) and a summary line.
#
# Reads the globals $search, $opt_h, $opt_e, $DEBUG, $configFile and
# %searchEngines; leaves the last requested URL in the global $url.
# Only does a single engine for now (allow multiple engines later).
#
# Dies when the engine's NumberHitsPerPage is not a positive integer,
# because the paging loop below could never terminate with it.
#
sub ParseUrlData {

    # Get URL for engine we're doing here:
    my $engineName = shift;

    # Encode the search string first
    my $term = uri_escape($search);
    # Pristine first-page URL.  Every page URL is derived from this one
    # (never from the previously rewritten URL), so the page-counter
    # pattern still matches on the third and later pages.
    my $baseUrl = $searchEngines{$engineName . 'SearchEngineURL'} . $term;
    $url = $baseUrl;

    my $results = "\t*****************************************************************\n\n";
    my $rank = 0;


    # Set start and end stuff
    my $itVariable = $searchEngines{$engineName . "PageCounterVariable"};
    my $itStarter = $searchEngines{$engineName . "PageCounterStarter"};
    my $hsep = $searchEngines{$engineName . "HitSeparator"};
    my $hsepend = $searchEngines{$engineName . "HitSeparatorEnd"};
    my $hsepstart = $searchEngines{$engineName . "HitSeparatorStart"};
    my $hstart = $searchEngines{$engineName . "HitStart"};
    my $hend = $searchEngines{$engineName . "HitEnd"};
    my $hitsPerPage = $searchEngines{$engineName . "NumberHitsPerPage"};

    # A zero/blank page size would make the loop below spin forever.
    unless (defined($hitsPerPage) && $hitsPerPage =~ m/\A[0-9]+\z/ && $hitsPerPage > 0) {
        die "Engine '$engineName' needs a positive NumberHitsPerPage in $configFile.\n";
    }

    # Number of the first hit as the engine counts it (usually 0 or 1);
    # page offsets are counted from here.
    my $firstHit = (defined($itStarter) && $itStarter =~ m/\A[0-9]+\z/) ? $itStarter : 0;

    ### Google doesn't like the simple get, so use a full user agent.
    ### One agent is enough for all pages.
    my $ua = LWP::UserAgent->new;
    $ua->agent("Sethi Search Agent");
    $ua->timeout(300);


    # Get required number of pages:
    # Couldn't just use a NumberOfHitsWanted variable since all
    # engines (notably Altavista) don't support it
    for (my $page = 0; $page * $hitsPerPage < $opt_h; $page++) {
        # PageCounterVariable is used as a full regex (mainly for MSN),
        # so the counter is patched into the URL by substitution.
        my $offset = $firstHit + $page * $hitsPerPage;
        $url = $baseUrl;
        if (defined($itVariable) && length($itVariable)) {
            $url =~ s%($itVariable)$itStarter%$1$offset%;
        }
        if ($DEBUG) {print $url . "\n";}

        my $req = HTTP::Request->new(GET => $url);
        $req->content_type('text/html');
        my $res = $ua->request($req);
        # Successful?  If not, stop paging instead of re-parsing the
        # previous page (which would report every hit twice).
        unless ($res->is_success) {
            print "Found Errors here:  " . $res->status_line . "\n";
            last;
        }
        my $doc = $res->content;

        if ($DEBUG > 3) {print $doc;}


        ### Parse the link
        my @searchlines = split(/$hsep/, $doc);
        if ($DEBUG > 1) {print "For $hsep, Got number of lines: ". $#searchlines ."\n";}

        foreach my $line (@searchlines) {
            if ($DEBUG > 3) {print "\n\nGot line: $line\n";}

            # Only blocks containing both hitstart and hitend are hits
            next unless ( ($line =~ m%$hstart%) && ($line =~ m%$hend%) );

            $rank++;
            if ($DEBUG > 1) {print "With HitStart: $hstart and HitEnd: $hend, got Rank: $rank for line: $line\n";}
            # If already found as many as we need, break out:
            if ($rank > $opt_h) {$rank--; last;}

            # If first one on a page, hose any initial stuff:
            if ( ($rank - 1) % $hitsPerPage == 0 ) {
                $line =~ s%^.*?($hsepstart.*)%$1%s;
                if ($DEBUG > 1) {print "For FIRST one with HitSeparatorStart: $hsepstart, hosing $1 \n\n\n from $line\n";}
            }
            # If last one on a page, hose any trailing info, if necessary:
            if ( ($rank % $hitsPerPage == 0) && ($hsepend) ) {
                $line =~ s%(.*?$hsepend).*%$1%s;
            }

            # Save URL info.  Capture in list context so a failed match
            # yields an empty URL rather than a stale $1 from above.
            my ($saveurl) = $line =~ m%$hstart.*?(https?://.*?)("?>|$hend)%i;
            $saveurl = "" unless defined($saveurl);

            # Hose all other HTML tags:
            $line =~ s%<.*?>%%g;
            $line =~ s%&nbsp;% %g;
            $line =~ s%&quot;%"%g;
            $line =~ s%&#149;%*%g;
            # Hose consecutive newlines:
            $line =~ s%(\n\s*)+%\n%g;

            # Output final result (number it ourselves unless the
            # engine already numbers its hits):
            if ($searchEngines{$engineName . 'PositionIdentifierRegex'}) {
                $results .= $line . "\n\t[URL:  $saveurl]\n\n";
            } else {
                $results .= "$rank.  " . $line . "\n\t[URL:  $saveurl]\n\n";
            }
        }
    }


    $results .= "\n***** Searched " . ucfirst($opt_e) . " for '$search' and returned $rank hits.\n";
    return $results;
}


############### Subroutine for Reading Engines File ###############
# Adapted from lonelyeye.pl
#
# ReadEngines()
#
# Parses the engines configuration file named by the global $configFile.
# The file is a series of engine definitions; each one starts with a
# line of at least 25 dashes, holds "Key: value" lines and ends with a
# line containing ENDengineDEFINITION.  Blank lines and lines starting
# with '#' are ignored everywhere.
#
# For every definition with a SearchEngineName it fills two globals:
#   %enginesToDo   -- lower-cased engine name => lower-cased engine name
#   %searchEngines -- lower-cased engine name . Key => value
#
# Dies if the file cannot be opened or if a definition does not start
# with a separator line.

sub ReadEngines {

    # Three-argument open: $configFile may come straight from -C, so
    # it must never be interpreted as an open mode or a pipe.
    my $configHandle;
    unless ( open ($configHandle, '<', $configFile) ) {
        $errorMsg = "Couldn't open Engines' Configuration File $configFile ($!).\n";
        die "$errorMsg\n";
    }


    ### Build up SearchEngines Hash, one engine definition per pass
    while (defined(my $separator = <$configHandle>)) {
        chomp ($separator);

        # Skip blank lines and comments between definitions
        next if ( ($separator eq "") || ($separator =~ m%^\#%) );

        if ($separator !~ m%^-{25}%) {
            # Fatal Error Message
            $errorMsg = "Engines' Configuration File has the wrong format since I got $separator.\n";
            die "$errorMsg\n";
        }


        # Settings of THIS engine only; a fresh hash per definition so
        # nothing leaks over from the previous engine.
        my %variable;
        while (defined(my $entry = <$configHandle>)) {
            chomp ($entry);
            # Skip blank lines and comments
            next if ( ($entry eq "") || ($entry =~ m%^\#%) );

            # Split on the first colon only: values (URLs!) contain colons
            my ($key, $value) = split(/:/, $entry, 2);
            last if ($key =~ m%ENDengineDEFINITION%);

            # Hose leading whitespace (trailing whitespace is significant):
            $value = "" unless defined($value);
            $value =~ s/^\s*//;
            $variable{$key} = $value;
        }

        # lowercase searchEngine name; a definition without a name
        # (e.g. a stray trailing separator) cannot be addressed, so skip it
        my $searchEngine = lc(defined($variable{'SearchEngineName'}) ? $variable{'SearchEngineName'} : "");
        next unless length($searchEngine);

        ### Add name of this engine to second hash (%enginesToDo)
        $enginesToDo{$searchEngine} = $searchEngine;


        ### Now build the %searchEngines hash:
        foreach my $key (sort keys(%variable)) {
            $searchEngines{$searchEngine . $key} = $variable{$key};
        }

    }

    close ($configHandle);
}


############### Subroutine for Creating Default engine.config ###############
# CreateDefaultConfig()
#
# Writes the built-in engine definitions (just Google, Yahoo, and MSN)
# to the file named by the global $default_config, replacing whatever
# was there before.
#
# Dies if the file cannot be created or completely written; otherwise
# the script would carry on and read a stale or foreign file.

sub CreateDefaultConfig {

    # Three-argument open with a lexical handle; the text below is
    # written verbatim (single-quoted heredoc, nothing is interpolated).
    open (my $configHandle, '>', $default_config)
        or die "Couldn't create default Engines' Configuration File $default_config ($!).\n";
    print {$configHandle} <<'ENDOFCONFIG';
#---------------------------------------------
# TODO:
# - Add a KillBefore and KillAfter where everything before the
#   KillBefore regex is dropped from the $doc and everything after the
#   KillAfter regex is dropped from the $doc.
# 
# 
# 
# SearchEngineName:		-- String: One word name of engine (serves as key in engines hash)
# SearchEngineNote:		-- String: Any additional notes (displayed on results page only)
# SearchEngineURL:		-- String: Special format; if it uses GET, last field should be search field
# SearchEngineLogo:		-- String: Simple URL of logo
# RegExNoMatch:			-- Regex: If regexnomatchState is +, this is the phrase that says "No results found"; if -, says "Found 32 matches"
# SearchType:			-- String: GET or POST
# PostSubmitContent:		-- String: If POST method used, this contains the variables (last one should be search field -- see SearchEngineURL)
# TotalFoundRegEx:		-- Regex: E.g, "Google results <b>.*?</b> of about <b>(.*)</b> for <b>"
# LimitPages:			-- Numeric: Should we limit search to a certain number of pages (instead of the maximum number of pages)?
# NumberHitsPerPage:		-- Numeric: How many hits are returned to a page?
# PositionIdentifierRegex:	-- Regex: Does the engine number the returned hits?
# HitSeparator: </li><p>	-- String: Splits document on the HitSeparator; if no numbering provided by engine, this lets us count it ourselves
# HitStart:			-- Regex: Start of hit
# HitEnd:			-- Regex: End of hit
# PageCounterVariable:		-- String: Which variable in the SearchEngineURL or PostSubmitContent controls which hit number of results to start with
# PageCounterIncrementer:	-- Numeric: By how many hits should we increase each time (usually this is the NumberHitsPerPage)
# PageCounterStarter:		-- Numberic: What number does the count start with?  Usually 0 or 1
# ENDengineDEFINITION
# 
# General Tips for Regexes:  
# 1. Make sure you use NON-GREEDY versions (e.g., \s*? instead of \s*)
# 2. Remember that the magic switches %ms are set so . matches
#    newlines across WHOLE document (i.e., ^ and $ don't stop at \n;
#    see p.233 of Mastering Regular Expressions
# 3. Test regexes by turning on $DEBUG; make sure you individually
#    test EACH engine's different regexes (use the engine.config.test
#    file)! 
# 
# @NOTE: In order to figure out correct regex patterns below, turn on DEBUG'ing and look at generated page as View Source in the Browser looks at JavaScript interpreted page!
# 
---------------------------------------------
SearchEngineName: google
SearchEngineNote: 
SearchEngineURL: http://www.google.com/search?sourceid=sethi&hl=en&ie=UTF-8&oe=UTF-8&start=0&q=
SearchEngineLogo: http://www.google.com/logos/Logo_40gry.gif
RegExNoMatchState: Positive
RegExNoMatch: did not match any documents in this database
SearchType: Get
PostSubmitContent: 
TotalFoundRegEx: Results <b>.*?</b> - <b>.*?</b> of about <b>(.*)</b>
LimitPages: 
NumberHitsPerPage: 10
PositionIdentifierRegex: 
# Not sure why the </font> works... but it breaks the page into chunks
# on the </font> and then searches for hitStart and hitEnd to qualify
# whether or not that chunk contains a legitimate hit or not...
# 2006-05-21: Apparently, the comments are parsed out by default with LWP
#HitSeparator: <!--n-->
#HitSeparator: </table>
HitSeparator: </td></tr></table></div>
# Added in for Google on 2007-04-03:
HitSeparatorStart: <div><div class=g>
HitSeparatorEnd: 
#HitStart: <p class=g><!--m--><link rel="prefetch" href=
#HitStart: <p class=g><a class=l href=
HitStart: <div class=g><a href=
HitEnd: </a>
PageCounterVariable: start=
PageCounterIncrementer: 10
PageCounterStarter: 0
ENDengineDEFINITION
---------------------------------------------
# For Yahoo, check to see if it says "Go To Web Page Matches" or "Next
# __ Matches" which means that it found the search term on local
# yahoo; so if it's successful like that, simply return the page
# number it's on.  If it says "Web Pages (1-2 of 2)" or "Sorry, no
# matches were found containing" then no Yahoo match.
# 
# Do a Yahoo!Web search separately (treat as different engine).
# 
# Web Pages search:
# http://ink.yahoo.com/bin/query?p=foo&hc=2&hs=24
# Yahoo Categories search:
# http://search.yahoo.com/search?p=foo&hc=2&hs=24&h=c
# Yahoo Sites search:
# http://search.yahoo.com/search?p=foo&hc=2&hs=24&h=s
# Yahoo General search:
# http://search.yahoo.com/search?p=foo
# 
# For Yahoo! regular (i.e., not web pages) search, ONLY search sites
# since categories can't contain corroborative URL!!!
# 
SearchEngineName: yahoo
SearchEngineNote: 
SearchEngineURL: http://search.yahoo.com/bin/search?b=1&h=s&p=
SearchEngineLogo: http://a1.g.a.yimg.com/7/1/31/000/us.yimg.com/i/yahoo.gif
RegExNoMatchState: Positive
RegExNoMatch: Search\s*?Result.*?Found.*\s*?web pages\s*?for
SearchType: Get
PostSubmitContent: 
TotalFoundRegEx: Found <b>.*?</b>\s*categories\s*and <b>(.*?)</b>\s*sites\s*for
LimitPages: 
NumberHitsPerPage: 20
PositionIdentifierRegex: 
#HitSeparator: \n
HitSeparator: <li>
HitSeparatorStart: 
HitSeparatorEnd: 
HitStart: <big>\s*<a href="
HitEnd: </a>\n<p>
PageCounterVariable: b=
PageCounterIncrementer: 20
PageCounterStarter: 1
ENDengineDEFINITION
---------------------------------------------
# MSN has a lot of screwy ASP/M$ crappola (see associated comments)
SearchEngineName: msn
SearchEngineNote: 
# First one skips from 0-0 here; Second one says we want 20 results per page
SearchEngineURL: http://search.msn.com/results.asp?ba=(0.0)0&co=(0.20)2.1.4&FORM=SMCRT&RS=CHECKED&b=1&v=1&q=
SearchEngineLogo: http://logo.msn.com/en-us/default/logo.gif
RegExNoMatchState: Positive
RegExNoMatch: No sites were found containing
SearchType: Get
PostSubmitContent: 
TotalFoundRegEx: <TD ALIGN=\"right\"><SPAN CLASS=\"clsResultSectionName.*? of (\d+)</SPAN></TD>
LimitPages: 
NumberHitsPerPage: 20
PositionIdentifierRegex: 
HitSeparator: </table>
HitSeparatorStart: 
HitSeparatorEnd: 
HitStart:   <a 
HitEnd: </a><table
# Need to escape the paranthesis for MSN
PageCounterVariable: ba=\(0\.
PageCounterIncrementer: 20
PageCounterStarter: 0
ENDengineDEFINITION
ENDOFCONFIG

    # Buffered write errors (disk full, ...) only surface at close.
    close ($configHandle)
        or die "Couldn't finish writing default Engines' Configuration File $default_config ($!).\n";
}


__END__