Upgrade to Snoopy 1.2.3

git-svn-id: http://svn.automattic.com/wordpress/trunk@3191 1a063a9b-81f0-0310-95a4-ce76da25c4cd
This commit is contained in:
ryan 2005-11-22 01:14:26 +00:00
parent 73dde55507
commit a9d801767f
1 changed files with 452 additions and 94 deletions

View File

@ -5,7 +5,7 @@
Snoopy - the PHP net client
Author: Monte Ohrt <monte@ispi.net>
Copyright (c): 1999-2000 ispi, all rights reserved
Version: 1.0
Version: 1.01
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
@ -31,7 +31,7 @@ CTO, ispi
Lincoln, NE 68510
The latest version of Snoopy can be obtained from:
http://snoopy.sourceforge.net
http://snoopy.sourceforge.net/
*************************************************/
@ -46,7 +46,10 @@ class Snoopy
var $port = 80; // port we are connecting to
var $proxy_host = ""; // proxy host to use
var $proxy_port = ""; // proxy port to use
var $agent = "Snoopy v1.0"; // agent we masquerade as
var $proxy_user = ""; // proxy user to use
var $proxy_pass = ""; // proxy password to use
var $agent = "Snoopy v1.2.3"; // agent we masquerade as
var $referer = ""; // referer info to pass
var $cookies = array(); // array of cookies to pass
// $cookies["username"]="joe";
@ -59,7 +62,7 @@ class Snoopy
var $maxframes = 0; // frame content depth maximum. 0 = disallow
var $expandlinks = true; // expand links to fully qualified URLs.
// this only applies to fetchlinks()
// or submitlinks()
// submitlinks(), and submittext()
var $passcookies = true; // pass set cookies back through redirects
// NOTE: this currently does not respect
// dates, domains or paths.
@ -81,8 +84,12 @@ class Snoopy
// set to 0 to disallow timeouts
var $timed_out = false; // if a read operation timed out
var $status = 0; // http request status
var $curl_path = "/usr/bin/curl";
var $temp_dir = "/tmp"; // temporary directory that the webserver
// has permission to write to.
// under Windows, this should be C:\temp
var $curl_path = "/usr/local/bin/curl";
// Snoopy will use cURL for fetching
// SSL content if a full system path to
// the cURL binary is supplied here.
@ -94,9 +101,6 @@ class Snoopy
// as these functions are not stable
// as of this Snoopy release.
// send Accept-encoding: gzip?
var $use_gzip = true;
/**** Private variables ****/
var $_maxlinelen = 4096; // max line length (headers)
@ -132,8 +136,12 @@ class Snoopy
$this->user = $URI_PARTS["user"];
if (!empty($URI_PARTS["pass"]))
$this->pass = $URI_PARTS["pass"];
if (empty($URI_PARTS["query"]))
$URI_PARTS["query"] = '';
if (empty($URI_PARTS["path"]))
$URI_PARTS["path"] = '';
switch($URI_PARTS["scheme"])
switch(strtolower($URI_PARTS["scheme"]))
{
case "http":
$this->host = $URI_PARTS["host"];
@ -148,7 +156,7 @@ class Snoopy
}
else
{
$path = $URI_PARTS["path"].(isset($URI_PARTS["query"]) ? "?".$URI_PARTS["query"] : "");
$path = $URI_PARTS["path"].($URI_PARTS["query"] ? "?".$URI_PARTS["query"] : "");
// no proxy, send only the path
$this->_httprequest($path, $fp, $URI, $this->_httpmethod);
}
@ -195,10 +203,11 @@ class Snoopy
return true;
break;
case "https":
if(!$this->curl_path || (!is_executable($this->curl_path))) {
$this->error = "Bad curl ($this->curl_path), can't fetch HTTPS \n";
if(!$this->curl_path)
return false;
}
if(function_exists("is_executable"))
if (!is_executable($this->curl_path))
return false;
$this->host = $URI_PARTS["host"];
if(!empty($URI_PARTS["port"]))
$this->port = $URI_PARTS["port"];
@ -257,7 +266,346 @@ class Snoopy
return true;
}
/*======================================================================*\
Function: submit
Purpose: submit an http form
Input: $URI the location to post the data
$formvars the formvars to use.
format: $formvars["var"] = "val";
$formfiles an array of files to submit
format: $formfiles["var"] = "/dir/filename.ext";
Output: $this->results the text output from the post
\*======================================================================*/
function submit($URI, $formvars="", $formfiles="")
{
unset($postdata);
$postdata = $this->_prepare_post_body($formvars, $formfiles);
$URI_PARTS = parse_url($URI);
if (!empty($URI_PARTS["user"]))
$this->user = $URI_PARTS["user"];
if (!empty($URI_PARTS["pass"]))
$this->pass = $URI_PARTS["pass"];
if (empty($URI_PARTS["query"]))
$URI_PARTS["query"] = '';
if (empty($URI_PARTS["path"]))
$URI_PARTS["path"] = '';
switch(strtolower($URI_PARTS["scheme"]))
{
case "http":
$this->host = $URI_PARTS["host"];
if(!empty($URI_PARTS["port"]))
$this->port = $URI_PARTS["port"];
if($this->_connect($fp))
{
if($this->_isproxy)
{
// using proxy, send entire URI
$this->_httprequest($URI,$fp,$URI,$this->_submit_method,$this->_submit_type,$postdata);
}
else
{
$path = $URI_PARTS["path"].($URI_PARTS["query"] ? "?".$URI_PARTS["query"] : "");
// no proxy, send only the path
$this->_httprequest($path, $fp, $URI, $this->_submit_method, $this->_submit_type, $postdata);
}
$this->_disconnect($fp);
if($this->_redirectaddr)
{
/* url was redirected, check if we've hit the max depth */
if($this->maxredirs > $this->_redirectdepth)
{
if(!preg_match("|^".$URI_PARTS["scheme"]."://|", $this->_redirectaddr))
$this->_redirectaddr = $this->_expandlinks($this->_redirectaddr,$URI_PARTS["scheme"]."://".$URI_PARTS["host"]);
// only follow redirect if it's on this site, or offsiteok is true
if(preg_match("|^http://".preg_quote($this->host)."|i",$this->_redirectaddr) || $this->offsiteok)
{
/* follow the redirect */
$this->_redirectdepth++;
$this->lastredirectaddr=$this->_redirectaddr;
if( strpos( $this->_redirectaddr, "?" ) > 0 )
$this->fetch($this->_redirectaddr); // the redirect has changed the request method from post to get
else
$this->submit($this->_redirectaddr,$formvars, $formfiles);
}
}
}
if($this->_framedepth < $this->maxframes && count($this->_frameurls) > 0)
{
$frameurls = $this->_frameurls;
$this->_frameurls = array();
while(list(,$frameurl) = each($frameurls))
{
if($this->_framedepth < $this->maxframes)
{
$this->fetch($frameurl);
$this->_framedepth++;
}
else
break;
}
}
}
else
{
return false;
}
return true;
break;
case "https":
if(!$this->curl_path)
return false;
if(function_exists("is_executable"))
if (!is_executable($this->curl_path))
return false;
$this->host = $URI_PARTS["host"];
if(!empty($URI_PARTS["port"]))
$this->port = $URI_PARTS["port"];
if($this->_isproxy)
{
// using proxy, send entire URI
$this->_httpsrequest($URI, $URI, $this->_submit_method, $this->_submit_type, $postdata);
}
else
{
$path = $URI_PARTS["path"].($URI_PARTS["query"] ? "?".$URI_PARTS["query"] : "");
// no proxy, send only the path
$this->_httpsrequest($path, $URI, $this->_submit_method, $this->_submit_type, $postdata);
}
if($this->_redirectaddr)
{
/* url was redirected, check if we've hit the max depth */
if($this->maxredirs > $this->_redirectdepth)
{
if(!preg_match("|^".$URI_PARTS["scheme"]."://|", $this->_redirectaddr))
$this->_redirectaddr = $this->_expandlinks($this->_redirectaddr,$URI_PARTS["scheme"]."://".$URI_PARTS["host"]);
// only follow redirect if it's on this site, or offsiteok is true
if(preg_match("|^http://".preg_quote($this->host)."|i",$this->_redirectaddr) || $this->offsiteok)
{
/* follow the redirect */
$this->_redirectdepth++;
$this->lastredirectaddr=$this->_redirectaddr;
if( strpos( $this->_redirectaddr, "?" ) > 0 )
$this->fetch($this->_redirectaddr); // the redirect has changed the request method from post to get
else
$this->submit($this->_redirectaddr,$formvars, $formfiles);
}
}
}
if($this->_framedepth < $this->maxframes && count($this->_frameurls) > 0)
{
$frameurls = $this->_frameurls;
$this->_frameurls = array();
while(list(,$frameurl) = each($frameurls))
{
if($this->_framedepth < $this->maxframes)
{
$this->fetch($frameurl);
$this->_framedepth++;
}
else
break;
}
}
return true;
break;
default:
// not a valid protocol
$this->error = 'Invalid protocol "'.$URI_PARTS["scheme"].'"\n';
return false;
break;
}
return true;
}
/*======================================================================*\
Function: fetchlinks
Purpose: fetch the links from a web page
Input: $URI where you are fetching from
Output: $this->results an array of the URLs
\*======================================================================*/
function fetchlinks($URI)
{
if ($this->fetch($URI))
{
if($this->lastredirectaddr)
$URI = $this->lastredirectaddr;
if(is_array($this->results))
{
for($x=0;$x<count($this->results);$x++)
$this->results[$x] = $this->_striplinks($this->results[$x]);
}
else
$this->results = $this->_striplinks($this->results);
if($this->expandlinks)
$this->results = $this->_expandlinks($this->results, $URI);
return true;
}
else
return false;
}
/*======================================================================*\
Function: fetchform
Purpose: fetch the form elements from a web page
Input: $URI where you are fetching from
Output: $this->results the resulting html form
\*======================================================================*/
function fetchform($URI)
{
if ($this->fetch($URI))
{
if(is_array($this->results))
{
for($x=0;$x<count($this->results);$x++)
$this->results[$x] = $this->_stripform($this->results[$x]);
}
else
$this->results = $this->_stripform($this->results);
return true;
}
else
return false;
}
/*======================================================================*\
Function: fetchtext
Purpose: fetch the text from a web page, stripping the links
Input: $URI where you are fetching from
Output: $this->results the text from the web page
\*======================================================================*/
function fetchtext($URI)
{
if($this->fetch($URI))
{
if(is_array($this->results))
{
for($x=0;$x<count($this->results);$x++)
$this->results[$x] = $this->_striptext($this->results[$x]);
}
else
$this->results = $this->_striptext($this->results);
return true;
}
else
return false;
}
/*======================================================================*\
Function: submitlinks
Purpose: grab links from a form submission
Input: $URI where you are submitting from
Output: $this->results an array of the links from the post
\*======================================================================*/
function submitlinks($URI, $formvars="", $formfiles="")
{
if($this->submit($URI,$formvars, $formfiles))
{
if($this->lastredirectaddr)
$URI = $this->lastredirectaddr;
if(is_array($this->results))
{
for($x=0;$x<count($this->results);$x++)
{
$this->results[$x] = $this->_striplinks($this->results[$x]);
if($this->expandlinks)
$this->results[$x] = $this->_expandlinks($this->results[$x],$URI);
}
}
else
{
$this->results = $this->_striplinks($this->results);
if($this->expandlinks)
$this->results = $this->_expandlinks($this->results,$URI);
}
return true;
}
else
return false;
}
/*======================================================================*\
Function: submittext
Purpose: grab text from a form submission
Input: $URI where you are submitting from
Output: $this->results the text from the web page
\*======================================================================*/
function submittext($URI, $formvars = "", $formfiles = "")
{
if($this->submit($URI,$formvars, $formfiles))
{
if($this->lastredirectaddr)
$URI = $this->lastredirectaddr;
if(is_array($this->results))
{
for($x=0;$x<count($this->results);$x++)
{
$this->results[$x] = $this->_striptext($this->results[$x]);
if($this->expandlinks)
$this->results[$x] = $this->_expandlinks($this->results[$x],$URI);
}
}
else
{
$this->results = $this->_striptext($this->results);
if($this->expandlinks)
$this->results = $this->_expandlinks($this->results,$URI);
}
return true;
}
else
return false;
}
/*======================================================================*\
Function: set_submit_multipart
Purpose: Set the form submission content type to
multipart/form-data
\*======================================================================*/
function set_submit_multipart()
{
$this->_submit_type = "multipart/form-data";
}
/*======================================================================*\
Function: set_submit_normal
Purpose: Set the form submission content type to
application/x-www-form-urlencoded
\*======================================================================*/
function set_submit_normal()
{
$this->_submit_type = "application/x-www-form-urlencoded";
}
/*======================================================================*\
Private functions
@ -273,7 +621,7 @@ class Snoopy
function _striplinks($document)
{
preg_match_all("'<\s*a\s+.*href\s*=\s* # find <a href=
preg_match_all("'<\s*a\s.*?href\s*=\s* # find <a href=
([\"\'])? # find single or double quote
(?(1) (.*?)\\1 | ([^\s\>]+)) # if quote found, match up to next matching
# quote, otherwise match up to next space
@ -335,16 +683,27 @@ class Snoopy
$search = array("'<script[^>]*?>.*?</script>'si", // strip out javascript
"'<[\/\!]*?[^<>]*?>'si", // strip out html tags
"'([\r\n])[\s]+'", // strip out white space
"'&(quote|#34);'i", // replace html entities
"'&(amp|#38);'i",
"'&(lt|#60);'i",
"'&(gt|#62);'i",
"'&(nbsp|#160);'i",
"'&(quot|#34|#034|#x22);'i", // replace html entities
"'&(amp|#38|#038|#x26);'i", // added hexadecimal values
"'&(lt|#60|#060|#x3c);'i",
"'&(gt|#62|#062|#x3e);'i",
"'&(nbsp|#160|#xa0);'i",
"'&(iexcl|#161);'i",
"'&(cent|#162);'i",
"'&(pound|#163);'i",
"'&(copy|#169);'i"
);
"'&(copy|#169);'i",
"'&(reg|#174);'i",
"'&(deg|#176);'i",
"'&(#39|#039|#x27);'",
"'&(euro|#8364);'i", // europe
"'&a(uml|UML);'", // german
"'&o(uml|UML);'",
"'&u(uml|UML);'",
"'&A(uml|UML);'",
"'&O(uml|UML);'",
"'&U(uml|UML);'",
"'&szlig;'i",
);
$replace = array( "",
"",
"\\1",
@ -356,7 +715,19 @@ class Snoopy
chr(161),
chr(162),
chr(163),
chr(169));
chr(169),
chr(174),
chr(176),
chr(39),
chr(128),
"ä",
"ö",
"ü",
"Ä",
"Ö",
"Ü",
"ß",
);
$text = preg_replace($search,$replace,$document);
@ -377,14 +748,20 @@ class Snoopy
preg_match("/^[^\?]+/",$URI,$match);
$match = preg_replace("|/[^\/\.]+\.[^\/\.]+$|","",$match[0]);
$match = preg_replace("|/$|","",$match);
$match_part = parse_url($match);
$match_root =
$match_part["scheme"]."://".$match_part["host"];
$search = array( "|^http://".preg_quote($this->host)."|i",
"|^(?!http://)(\/)?(?!mailto:)|i",
"|^(\/)|i",
"|^(?!http://)(?!mailto:)|i",
"|/\./|",
"|/[^\/]+/\.\./|"
);
$replace = array( "",
$match_root."/",
$match."/",
"/",
"/"
@ -407,6 +784,7 @@ class Snoopy
function _httprequest($url,$fp,$URI,$http_method,$content_type="",$body="")
{
$cookie_headers = '';
if($this->passcookies && $this->_redirectaddr)
$this->setcookies();
@ -416,25 +794,14 @@ class Snoopy
$headers = $http_method." ".$url." ".$this->_httpversion."\r\n";
if(!empty($this->agent))
$headers .= "User-Agent: ".$this->agent."\r\n";
if(!empty($this->host) && !isset($this->rawheaders['Host']))
$headers .= "Host: ".$this->host."\r\n";
if(!empty($this->host) && !isset($this->rawheaders['Host'])) {
$headers .= "Host: ".$this->host;
if(!empty($this->port))
$headers .= ":".$this->port;
$headers .= "\r\n";
}
if(!empty($this->accept))
$headers .= "Accept: ".$this->accept."\r\n";
if($this->use_gzip) {
// make sure PHP was built with --with-zlib
// and we can handle gzipp'ed data
if ( function_exists(gzinflate) ) {
$headers .= "Accept-encoding: gzip\r\n";
}
else {
trigger_error(
"use_gzip is on, but PHP was built without zlib support.".
" Requesting file(s) without gzip encoding.",
E_USER_NOTICE);
}
}
if(!empty($this->referer))
$headers .= "Referer: ".$this->referer."\r\n";
if(!empty($this->cookies))
@ -467,7 +834,12 @@ class Snoopy
if(!empty($body))
$headers .= "Content-length: ".strlen($body)."\r\n";
if(!empty($this->user) || !empty($this->pass))
$headers .= "Authorization: BASIC ".base64_encode($this->user.":".$this->pass)."\r\n";
$headers .= "Authorization: Basic ".base64_encode($this->user.":".$this->pass)."\r\n";
//add proxy auth headers
if(!empty($this->proxy_user))
$headers .= 'Proxy-Authorization: ' . 'Basic ' . base64_encode($this->proxy_user . ':' . $this->proxy_pass)."\r\n";
$headers .= "\r\n";
@ -480,9 +852,6 @@ class Snoopy
$this->_redirectaddr = false;
unset($this->headers);
// content was returned gzip encoded?
$is_gzipped = false;
while($currentHeader = fgets($fp,$this->_maxlinelen))
{
@ -492,15 +861,14 @@ class Snoopy
return false;
}
// if($currentHeader == "\r\n")
if(preg_match("/^\r?\n$/", $currentHeader) )
break;
if($currentHeader == "\r\n")
break;
// if a header begins with Location: or URI:, set the redirect
if(preg_match("/^(Location:|URI:)/i",$currentHeader))
{
// get URL portion of the redirect
preg_match("/^(Location:|URI:)\s+(.*)/",chop($currentHeader),$matches);
preg_match("/^(Location:|URI:)[ ]+(.*)/i",chop($currentHeader),$matches);
// look for :// in the Location header to see if hostname is included
if(!preg_match("|\:\/\/|",$matches[2]))
{
@ -524,31 +892,19 @@ class Snoopy
}
$this->response_code = $currentHeader;
}
if (preg_match("/Content-Encoding: gzip/", $currentHeader) ) {
$is_gzipped = true;
}
$this->headers[] = $currentHeader;
}
# $results = fread($fp, $this->maxlength);
$results = "";
while ( $data = fread($fp, $this->maxlength) ) {
$results .= $data;
if (
strlen($results) > $this->maxlength ) {
break;
}
}
// gunzip
if ( $is_gzipped ) {
// per http://www.php.net/manual/en/function.gzencode.php
$results = substr($results, 10);
$results = gzinflate($results);
}
$results = '';
do {
$_data = fread($fp, $this->maxlength);
if (strlen($_data) == 0) {
break;
}
$results .= $_data;
} while(true);
if ($this->read_timeout > 0 && $this->_check_timeout($fp))
{
$this->status=-100;
@ -557,7 +913,8 @@ class Snoopy
// check if there is a a redirect meta tag
if(preg_match("'<meta[\s]*http-equiv[^>]*?content[\s]*=[\s]*[\"\']?\d+;[\s]+URL[\s]*=[\s]*([^\"\']*?)[\"\']?>'i",$results,$match))
if(preg_match("'<meta[\s]*http-equiv[^>]*?content[\s]*=[\s]*[\"\']?\d+;[\s]*URL[\s]*=[\s]*([^\"\']*?)[\"\']?>'i",$results,$match))
{
$this->_redirectaddr = $this->_expandlinks($match[1],$URI);
}
@ -603,7 +960,10 @@ class Snoopy
if(!empty($this->agent))
$headers[] = "User-Agent: ".$this->agent;
if(!empty($this->host))
$headers[] = "Host: ".$this->host;
if(!empty($this->port))
$headers[] = "Host: ".$this->host.":".$this->port;
else
$headers[] = "Host: ".$this->host;
if(!empty($this->accept))
$headers[] = "Accept: ".$this->accept;
if(!empty($this->referer))
@ -640,8 +1000,10 @@ class Snoopy
if(!empty($this->user) || !empty($this->pass))
$headers[] = "Authorization: BASIC ".base64_encode($this->user.":".$this->pass);
for($curr_header = 0; $curr_header < count($headers); $curr_header++)
$cmdline_params .= " -H \"".$headers[$curr_header]."\"";
for($curr_header = 0; $curr_header < count($headers); $curr_header++) {
$safer_header = strtr( $headers[$curr_header], "\"", " " );
$cmdline_params .= " -H \"".$safer_header."\"";
}
if(!empty($body))
$cmdline_params .= " -d \"$body\"";
@ -649,11 +1011,10 @@ class Snoopy
if($this->read_timeout > 0)
$cmdline_params .= " -m ".$this->read_timeout;
$headerfile = uniqid(time());
# accept self-signed certs
$cmdline_params .= " -k";
exec($this->curl_path." -D \"/tmp/$headerfile\"".$cmdline_params." ".$URI,$results,$return);
$headerfile = tempnam($temp_dir, "sno");
$safer_URI = strtr( $URI, "\"", " " ); // strip quotes from the URI to avoid shell access
exec($this->curl_path." -D \"$headerfile\"".$cmdline_params." \"".$safer_URI."\"",$results,$return);
if($return)
{
@ -664,7 +1025,7 @@ class Snoopy
$results = implode("\r\n",$results);
$result_headers = file("/tmp/$headerfile");
$result_headers = file("$headerfile");
$this->_redirectaddr = false;
unset($this->headers);
@ -676,7 +1037,7 @@ class Snoopy
if(preg_match("/^(Location: |URI: )/i",$result_headers[$currentHeader]))
{
// get URL portion of the redirect
preg_match("/^(Location: |URI:)(.*)/",chop($result_headers[$currentHeader]),$matches);
preg_match("/^(Location: |URI:)\s+(.*)/",chop($result_headers[$currentHeader]),$matches);
// look for :// in the Location header to see if hostname is included
if(!preg_match("|\:\/\/|",$matches[2]))
{
@ -693,19 +1054,14 @@ class Snoopy
}
if(preg_match("|^HTTP/|",$result_headers[$currentHeader]))
{
$this->response_code = $result_headers[$currentHeader];
if(preg_match("|^HTTP/[^\s]*\s(.*?)\s|",$this->response_code, $match))
{
$this->status= $match[1];
}
}
$this->response_code = $result_headers[$currentHeader];
$this->headers[] = $result_headers[$currentHeader];
}
// check if there is a a redirect meta tag
if(preg_match("'<meta[\s]*http-equiv[^>]*?content[\s]*=[\s]*[\"\']?\d+;[\s]+URL[\s]*=[\s]*([^\"\']*?)[\"\']?>'i",$results,$match))
if(preg_match("'<meta[\s]*http-equiv[^>]*?content[\s]*=[\s]*[\"\']?\d+;[\s]*URL[\s]*=[\s]*([^\"\']*?)[\"\']?>'i",$results,$match))
{
$this->_redirectaddr = $this->_expandlinks($match[1],$URI);
}
@ -724,7 +1080,7 @@ class Snoopy
else
$this->results = $results;
unlink("/tmp/$headerfile");
unlink("$headerfile");
return true;
}
@ -738,8 +1094,8 @@ class Snoopy
{
for($x=0; $x<count($this->headers); $x++)
{
if(preg_match("/^set-cookie:[\s]+([^=]+)=([^;]+)/i", $this->headers[$x],$match))
$this->cookies[$match[1]] = $match[2];
if(preg_match('/^set-cookie:[\s]+([^=]+)=([^;]+)/i', $this->headers[$x],$match))
$this->cookies[$match[1]] = urldecode($match[2]);
}
}
@ -773,6 +1129,7 @@ class Snoopy
if(!empty($this->proxy_host) && !empty($this->proxy_port))
{
$this->_isproxy = true;
$host = $this->proxy_host;
$port = $this->proxy_port;
}
@ -838,6 +1195,7 @@ class Snoopy
{
settype($formvars, "array");
settype($formfiles, "array");
$postdata = '';
if (count($formvars) == 0 && count($formfiles) == 0)
return;
@ -898,4 +1256,4 @@ class Snoopy
}
endif;
?>
?>