#!/usr/bin/perl
# -*- perl -*-
#
# Copyright (c) 1997, 2000 DJ Delorie, All Rights Reserved. NO WARRANTEE.
#
# Extend the module search path: every directory listed in $PATH, then the
# site bin directory.
# NOTE(review): taking @INC entries from the environment means the
# environment influences which files later `require` calls load -- confirm
# this is intended for the deployment.
push(@INC, split(':', $ENV{'PATH'}));
push(@INC, "/home/apache/bin");
# Disabled locking code, kept for reference.
#open(LOCK, ">/tmp/lock");
#flock(LOCK, 2);

# "see-script" mode: when the first command-line argument is "see-script",
# send this script's own source (ses.cgi, relative to the current directory)
# as plain text and stop.
if (defined($ARGV[0]) && $ARGV[0] eq "see-script") {
    print "Content-type: text/plain\n\n";
    if (open(IN, "ses.cgi")) {
        # Copy the file to the client line by line.
        print while <IN>;
        close(IN);
    } else {
        # The header is already out, so report the problem in the body
        # instead of returning an empty page.
        print "Sorry, the script source is not available: $!\n";
    }
    exit 0;
}
# Common set-up for a normal request: load the helper libraries, parse the
# CGI parameters, and start the HTML response.
use Socket;
#require "sys/socket.ph";

# A pack()-style template string.  Nothing in the visible code reads it;
# presumably it is consumed by code loaded below -- TODO confirm.
$sockaddr = "S n a4 x8";

# Helper code that lives next to this script.
require "./common.pl";

# cgi-lib.pl is located through @INC, so widen the search path first with the
# httpd bin directory and every directory in $PATH.
push @INC, "/usr/local/etc/httpd/bin", split(':', $ENV{"PATH"});
require "cgi-lib.pl";

# Parse the request; the script reads the submitted URL from $in{'url'} below.
&ReadParse;

# Collapse a doubled "http://" into a single one.
$in{'url'} =~ s{http://http://}{http://};

# HTTP header, then the page header produced by the external `header` command.
print "Content-type: text/html\n\n";
print qx{header -sky Search Engine Simulator};

# Assigned after the libraries are loaded, so this value wins over anything
# they may have set.  Presumably used as the User-Agent by webget -- verify.
$agent = "Delorie.com SES";
# Main request handling.
#
# With a `url` parameter: fetch the page and print three views of it
#   1. a summary built from all body text,
#   2. a summary built from heading text only,
#   3. the full text excerpt.
# Without one: print the introductory page.
#
# Variables are deliberately left as package globals (no `my`): the helper
# routines called here are defined outside this file and may rely on them.
#
# NOTE(review): several output strings below contain only newlines or empty
# strings where HTML markup (form, links, emphasis) would be expected.  They
# are reproduced as found; restore the markup from a pristine copy if one
# exists.

# Advice shown on both pages about table-based layouts.
$table_hint = <<EOF;
If your page uses tables, and the wrong text is showing up in these
examples (i.e. the sidebar instead of the body), I have some helpful hints for laying out your tables
to make your main text show up first
EOF

# Reduce HTML to displayable text.
#   $html   - markup whose whitespace has already been collapsed to spaces
#   $filler - what each removed comment/tag is replaced with (" " or "")
#   $limit  - optional; when true, cut the result at a word boundary after
#             at most that many characters (plus the final non-space one)
# Returns the text with stray "<" and ">" turned into entities.  "&" is left
# alone so entities already present in the page survive.
sub ses_plain_text {
    my ($html, $filler, $limit) = @_;

    # Comments first, then any remaining tag (quoted attribute values may
    # contain ">").
    $html =~ s{<!--.*?-->}{$filler}gs;
    $html =~ s{\<([^>\'\"]|\"[^\"]*\"|\'[^\']*\')*\>}{$filler}gsx;

    $html =~ s/\&nbsp;/ /g;
    $html =~ s/ +/ /g;
    $html =~ s/^ +//;

    $html =~ s/^(.{1,$limit}\S) .*/$1/ if $limit;

    $html =~ s/\</\&lt;/g;
    $html =~ s/\>/\&gt;/g;
    return $html;
}

if ($in{'url'}) {
    # NOTE(review): newline-only output kept as found.
    print "\n\n";
    print "\n";
    print "\n\n";

    # Only plain http:// URLs with a dotted host name are accepted.
    if ($in{'url'} !~ m@^http://([^\.]+\.)+[^\.]+@) {
        print "Sorry, I can only handle http://some.host/ URLs.\n";
        $u = &HtmlEncode($in{'url'});
        print "You typed in `$u'.\n";
        print `trailer`;
        exit 0;
    }

    # webget is defined outside this file; $webgot_url and $web_header are
    # read right after the call, so presumably it sets them -- verify.
    $page = &webget($in{'url'});
    $has_tables = 1 if $page =~ /<\s*table/i;

    if ($webgot_url ne $in{'url'}) {
        print "Note: The web server returned a \"redirect\". Search\n";
        print "engines may not always follow redirects. The URL below reflect the\n";
        print "page I was redirected to.\n\n";
    }

    ($date) = $web_header =~ /Last-modified:\s*(.*\S)/i;
    $size = length($page);

    # The URL is derived from user input / the remote server: encode it with
    # the same helper used for the error message above before echoing it.
    $shown_url = &HtmlEncode($webgot_url);
    print "$shown_url\n\n\n";

    # Collapse all whitespace runs so later patterns can work on one line.
    $page =~ s/[\r\n\t ]+/ /g;

    # Replace each image by its alt text (double-quoted, single-quoted, then
    # unquoted attribute values).
    $page =~ s@<img[^>]* alt="([^\"]*)"[^>]*>@$1@gi;
    $page =~ s@<img[^>]* alt='([^\']*)'[^>]*>@$1@gi;
    $page =~ s@<img[^>]* alt=([^ >]*)[^>]*>@$1@gi;

    # Title: text following <title> up to the next tag.
    ($title) = $page =~ m@<title>([^<]*)@i;
    if (! $title) {
        # No usable title: fall back to the last path component of the URL.
        # This is user-supplied text, so encode it before it is printed.
        $title = $in{'url'};
        $title =~ s@.*/(.+)@$1@;
        $title = &HtmlEncode($title);
    }

    # Banner line: title, then modification date (if the server sent one)
    # and size.  The date comes from the remote server, so encode it too.
    $banner = "$title ";
    if ($date) {
        $shown_date = &HtmlEncode($date);
        $banner .= " $shown_date, $size bytes\n\n";
    } else {
        $banner .= " $size bytes\n\n";
    }

    # NOTE(review): $trailer and $end are never assigned in this file; they
    # print as empty unless code loaded elsewhere sets them -- confirm.
    $start = "\n";

    # Example 1: everything after the closing </title> or </head> tag.
    $summary = $page;
    $summary =~ s@.*</(title|head)>@@i;
    $summary = &ses_plain_text($summary, " ", 200);
    print "Example 1: Summary based on all text on page\n\n";
    print $start, $banner, $summary, $trailer, $end;

    # Example 2: text of headings only.  Each heading runs from its opening
    # tag to the next heading/table tag.
    # NOTE(review): the opening tag is assumed to be <h1>..<h6>, based on the
    # example's caption and the closing alternatives -- confirm.
    $headers = $page;
    $htext = "";
    $i = 0;
    while ($headers =~ m@<h[1-6][^>]*>(.*?)</?(h[1-6]|tr|td|th|table)>@ig) {
        $htext .= $1;
        $htext .= " ";
        last if $i++ == 100; # just in case
    }
    $htext = &ses_plain_text($htext, "", 200);
    print "Example 2: Summary based on headers only\n\n";
    print $start, $banner, $htext, $trailer, $end;

    # Example 3: the whole page as text, not truncated.
    $text = &ses_plain_text($page, " ");
    print "Example 3: Text excerpt of page\n\n";
    # NOTE(review): the empty strings are kept as found.
    print $start, "", $text, "", $end;

    print $table_hint if $has_tables;
} else {
    print "This service allows web authors to see what their pages will look\n";
    print "like (sort of) to a search engine. This service ignores the META\n";
    print "tags that some search engines honor.\n\n";
    # NOTE(review): newline-only output kept as found.
    print "\n";
    print $table_hint;
    print "See the CGI's perl source\n\n";

    # Append viewers.html if it can be read; stay silent otherwise.
    if (open(V, "viewers.html")) {
        print while <V>;
        close(V);
    }
}

# Page footer produced by the external `trailer` command.
print `trailer`;