From: Jeff Hobbs Date: Tue, 19 Sep 2006 22:00:39 +0000 (+0000) Subject: * library/httpvfs.tcl (vfs::http::urlparse): add method to X-Git-Tag: vfs-1-4~52 X-Git-Url: https://privyetmir.co.uk/gitweb.cgi?a=commitdiff_plain;h=cfd685d0e2b49a2ea0a4f5f179d8f3d852b60caa;p=tclvfs * library/httpvfs.tcl (vfs::http::urlparse): add method to deconstruct the url using RFC 3986 semantics. (vfs::http::Mount): add support for HTTP basic auth if a user was passed in the url --- diff --git a/ChangeLog b/ChangeLog index cbd3ad4..d001120 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,10 @@ +2006-09-19 Jeff Hobbs + + * library/httpvfs.tcl (vfs::http::urlparse): add method to + deconstruct the url using RFC 3986 semantics. + (vfs::http::Mount): add support for HTTP basic auth if a user was + passed in the url + 2006-09-15 Jeff Hobbs * library/pkgIndex.tcl: bump vfs::http to 0.6 diff --git a/library/httpvfs.tcl b/library/httpvfs.tcl index 3717ec3..5dd5942 100644 --- a/library/httpvfs.tcl +++ b/library/httpvfs.tcl @@ -4,13 +4,14 @@ package provide vfs::http 0.6 package require vfs 1.0 package require http -# This works for basic operations, but has not been very debugged. +# This works for basic operations, using http GET and HEAD requests +# to serve http data in a read-only file system. namespace eval vfs::http { # Allow for options when mounting an http URL variable options # -urlencode means automatically parse "foo/my file (2).txt" as - # "foo/my%20file%20%282%29.txt", as per RFC 3986, for the user. + # "foo/my%20file%20%282%29.txt", per RFC 3986, for the user. set options(-urlencode) 1 # -urlparse would further parse URLs for ? (query string) and # (anchor) # components, leaving those unencoded. Only works when -urlencode is true. @@ -22,6 +23,8 @@ proc vfs::http::Mount {dirurl local args} { variable options foreach {key val} $args { # only do exact option name matching for now + # We could consider allowing general http options here, + # but those would be per-mount if {[info exists options($key)]} { # currently only boolean values if {![string is boolean -strict $val]} { @@ -30,62 +33,234 @@ proc vfs::http::Mount {dirurl local args} { set options($key) $val } } - if {[string index $dirurl end] ne "/"} { - append dirurl "/" - } - if {[string match "http://*" $dirurl]} { - set rest [string range $dirurl 7 end] - } else { - set rest $dirurl - set dirurl "http://${dirurl}" - } - if {![regexp {(([^:]*)(:([^@]*))?@)?([^/]*)(/(.*/)?([^/]*))?$} $rest \ - junk junk user junk pass host junk path file]} { - return -code error "unable to parse url \"$dirurl\"" - } + # Break the url into parts, verifying url + array set parts [urlparse $dirurl] - if {[string length $file]} { - return -code error "Can only mount directories, not\ - files (perhaps you need a trailing '/' - I understood\ - a path '$path' and file '$file')" + if {[info exists parts(query)] || [info exists parts(anchor)]} { + return -code error "invalid url \"$dirurl\":\ + no query string or anchor fragments allowed" } - if {$user eq ""} { - set user anonymous + if {[info exists parts(user)]} { + # At this point we need base64 for HTTP Basic AUTH + package require base64 + foreach {user passwd} [split $parts(user) :] { break } + set auth "Basic [base64::encode $user:$passwd]" + set headers [list Authorization $auth] + } else { + set headers "" } - set token [::http::geturl $dirurl -validate 1] + set token [::http::geturl $parts(url) -validate 1 -headers $headers] http::wait $token set status [http::status $token] http::cleanup $token if {$status ne "ok"} { # we'll take whatever http agrees is "ok" - return -code error "received status \"$status\" for \"$dirurl\"" + return -code error "received status \"$status\" for \"$parts(url)\"" } - if {![catch {vfs::filesystem info $dirurl}]} { + # Add a / to make sure the url and names are clearly separated later + if {[string index $parts(url) end] ne "/"} { + append parts(url) "/" + } + + if {![catch {vfs::filesystem info $parts(url)}]} { # unmount old mount - ::vfs::log "ftp-vfs: unmounted old mount point at $dirurl" - vfs::unmount $dirurl + ::vfs::log "ftp-vfs: unmounted old mount point at $parts(url)" + vfs::unmount $parts(url) } - ::vfs::log "http $host, $path mounted at $local" - vfs::filesystem mount $local [list vfs::http::handler $dirurl $path] - # Register command to unmount - vfs::RegisterMount $local [list ::vfs::http::Unmount $dirurl] - return $dirurl + ::vfs::log "http $dirurl ($parts(url)) mounted at $local" + # Pass headers along as they may include authentication + vfs::filesystem mount $local \ + [list vfs::http::handler $parts(url) $headers $parts(file)] + # Register command to unmount - headers not needed + vfs::RegisterMount $local [list ::vfs::http::Unmount $parts(url)] + return $parts(url) } -proc vfs::http::Unmount {dirurl local} { +proc vfs::http::Unmount {url local} { vfs::filesystem unmount $local } -proc vfs::http::handler {dirurl path cmd root relative actualpath args} { +proc vfs::http::handler {url headers path cmd root relative actualpath args} { if {$cmd eq "matchindirectory"} { - eval [linsert $args 0 $cmd $dirurl $relative $actualpath] + eval [linsert $args 0 $cmd $url $headers $relative $actualpath] + } else { + eval [linsert $args 0 $cmd $url $headers $relative] + } +} + +proc vfs::http::urlparse {url} { + # Taken from http 2.5.3 + + # Validate URL by parts. We suck out user:pass if it exists as the + # core http package does not automate HTTP Basic Auth yet. + + # Returns data in [array get] format. The url, host and file keys are + # guaranteed to exist. proto, port, query, anchor, and user should be + # checked with [info exists]. (user may contain password) + + # URLs have basically four parts. + # First, before the colon, is the protocol scheme (e.g. http) + # Second, for HTTP-like protocols, is the authority + # The authority is preceded by // and lasts up to (but not including) + # the following / and it identifies up to four parts, of which only one, + # the host, is required (if an authority is present at all). All other + # parts of the authority (user name, password, port number) are optional. + # Third is the resource name, which is split into two parts at a ? + # The first part (from the single "/" up to "?") is the path, and the + # second part (from that "?" up to "#") is the query. *HOWEVER*, we do + # not need to separate them; we send the whole lot to the server. + # Fourth is the fragment identifier, which is everything after the first + # "#" in the URL. The fragment identifier MUST NOT be sent to the server + # and indeed, we don't bother to validate it (it could be an error to + # pass it in here, but it's cheap to strip). + # + # An example of a URL that has all the parts: + # http://jschmoe:xyzzy@www.bogus.net:8000/foo/bar.tml?q=foo#changes + # The "http" is the protocol, the user is "jschmoe", the password is + # "xyzzy", the host is "www.bogus.net", the port is "8000", the path is + # "/foo/bar.tml", the query is "q=foo", and the fragment is "changes". + # + # Note that the RE actually combines the user and password parts, as + # recommended in RFC 3986. Indeed, that RFC states that putting passwords + # in URLs is a Really Bad Idea, something with which I would agree utterly. + # Also note that we do not currently support IPv6 addresses. + # + # From a validation perspective, we need to ensure that the parts of the + # URL that are going to the server are correctly encoded. + + set URLmatcher {(?x) # this is _expanded_ syntax + ^ + (?: (\w+) : ) ? # + (?: // + (?: + ( + [^@/\#?]+ # + ) @ + )? + ( [^/:\#?]+ ) # + (?: : (\d+) )? # + )? + ( / [^\#?]* (?: \? [^\#?]* )?)? # (including query) + (?: \# (.*) )? # (aka anchor) + $ + } + + # Phase one: parse + if {![regexp -- $URLmatcher $url -> proto user host port srvurl anchor]} { + unset $token + return -code error "Unsupported URL: $url" + } + # Phase two: validate + if {$host eq ""} { + # Caller has to provide a host name; we do not have a "default host" + # that would enable us to handle relative URLs. + unset $token + return -code error "Missing host part: $url" + # Note that we don't check the hostname for validity here; if it's + # invalid, we'll simply fail to resolve it later on. + } + if {$port ne "" && $port>65535} { + unset $token + return -code error "Invalid port number: $port" + } + # The user identification and resource identification parts of the URL can + # have encoded characters in them; take care! + if {$user ne ""} { + # Check for validity according to RFC 3986, Appendix A + set validityRE {(?xi) + ^ + (?: [-\w.~!$&'()*+,;=:] | %[0-9a-f][0-9a-f] )+ + $ + } + if {![regexp -- $validityRE $user]} { + unset $token + # Provide a better error message in this error case + if {[regexp {(?i)%(?![0-9a-f][0-9a-f]).?.?} $user bad]} { + return -code error \ + "Illegal encoding character usage \"$bad\" in URL user" + } + return -code error "Illegal characters in URL user" + } + } + if {$srvurl ne ""} { + # Check for validity according to RFC 3986, Appendix A + set validityRE {(?xi) + ^ + # Path part (already must start with / character) + (?: [-\w.~!$&'()*+,;=:@/] | %[0-9a-f][0-9a-f] )* + # Query part (optional, permits ? characters) + (?: \? (?: [-\w.~!$&'()*+,;=:@/?] | %[0-9a-f][0-9a-f] )* )? + $ + } + if {![regexp -- $validityRE $srvurl]} { + unset $token + # Provide a better error message in this error case + if {[regexp {(?i)%(?![0-9a-f][0-9a-f])..} $srvurl bad]} { + return -code error \ + "Illegal encoding character usage \"$bad\" in URL path" + } + return -code error "Illegal characters in URL path" + } } else { - eval [linsert $args 0 $cmd $dirurl $relative] + set srvurl / + } + if {$proto eq ""} { + set proto http + } else { + set result(proto) $proto + } + + # Here we vary from core http + + # vfs::http - we only support http at this time. Perhaps https later? + if {$proto ne "http"} { + return -code error "Unsupported URL type \"$proto\"" } + + # OK, now reassemble into a full URL, with result containing the + # parts that exist and will be returned to the user + array set result {} + set url ${proto}:// + if {$user ne ""} { + set result(user) $user + # vfs::http will do HTTP basic auth on their existence, + # but we pass these through as they are innocuous + append url $user + append url @ + } + append url $host + set result(host) $host + if {$port ne ""} { + # don't bother with adding default port + append url : $port + set result(port) $port + } + append url $srvurl + if {$anchor ne ""} { + # XXX: Don't append see the anchor, as it is generally a client-side + # XXX: item. The user can add it back if they want. + #append url \# $anchor + set result(anchor) $anchor + } + + set idx [string first ? $srvurl] + if {$idx >= 0} { + set query [string range [expr {$idx+1}] end] + set file [string range 0 [expr {$idx-1}]] + set result(file) $file + set result(query) $query + } else { + set result(file) $srvurl + } + + set result(url) $url + + # return array format list of items + return [array get result] } proc vfs::http::urlname {name} { @@ -116,7 +291,7 @@ proc vfs::http::urlname {name} { # If we implement the commands below, we will have a perfect # virtual file system for remote http sites. -proc vfs::http::stat {dirurl name} { +proc vfs::http::stat {dirurl headers name} { set urlname [urlname $name] ::vfs::log "stat $name ($urlname)" @@ -124,7 +299,7 @@ proc vfs::http::stat {dirurl name} { # as a file (not a directory) since with http, even directories # really behave as the index.html they contain. - set token [::http::geturl "$dirurl$urlname" -validate 1] + set token [::http::geturl "$dirurl$urlname" -validate 1 -headers $headers] http::wait $token set ncode [http::ncode $token] if {$ncode == 404 || [http::status $token] ne "ok"} { @@ -143,7 +318,7 @@ proc vfs::http::stat {dirurl name} { return $res } -proc vfs::http::access {dirurl name mode} { +proc vfs::http::access {dirurl headers name mode} { set urlname [urlname $name] ::vfs::log "access $name $mode ($urlname)" if {$mode & 2} { @@ -151,7 +326,7 @@ proc vfs::http::access {dirurl name mode} { return -code error "read-only" } if {$name == ""} { return 1 } - set token [::http::geturl "$dirurl$urlname" -validate 1] + set token [::http::geturl "$dirurl$urlname" -validate 1 -headers $headers] http::wait $token set ncode [http::ncode $token] if {$ncode == 404 || [http::status $token] ne "ok"} { @@ -169,7 +344,7 @@ proc vfs::http::access {dirurl name mode} { # We've chosen to implement these channels by using a memchan. # The alternative would be to use temporary files. -proc vfs::http::open {dirurl name mode permissions} { +proc vfs::http::open {dirurl headers name mode permissions} { set urlname [urlname $name] ::vfs::log "open $name $mode $permissions ($urlname)" # return a list of two elements: @@ -179,7 +354,7 @@ proc vfs::http::open {dirurl name mode permissions} { switch -glob -- $mode { "" - "r" { - set token [::http::geturl "$dirurl$urlname"] + set token [::http::geturl "$dirurl$urlname" -headers $headers] set filed [vfs::memchan] fconfigure $filed -translation binary @@ -202,7 +377,7 @@ proc vfs::http::open {dirurl name mode permissions} { } } -proc vfs::http::matchindirectory {dirurl path actualpath pattern type} { +proc vfs::http::matchindirectory {dirurl headers path actualpath pattern type} { ::vfs::log "matchindirectory $path $pattern $type" set res [list] @@ -218,22 +393,22 @@ proc vfs::http::matchindirectory {dirurl path actualpath pattern type} { return $res } -proc vfs::http::createdirectory {dirurl name} { +proc vfs::http::createdirectory {dirurl headers name} { ::vfs::log "createdirectory $name" vfs::filesystem posixerror $::vfs::posix(EROFS) } -proc vfs::http::removedirectory {dirurl name recursive} { +proc vfs::http::removedirectory {dirurl headers name recursive} { ::vfs::log "removedirectory $name" vfs::filesystem posixerror $::vfs::posix(EROFS) } -proc vfs::http::deletefile {dirurl name} { +proc vfs::http::deletefile {dirurl headers name} { ::vfs::log "deletefile $name" vfs::filesystem posixerror $::vfs::posix(EROFS) } -proc vfs::http::fileattributes {dirurl path args} { +proc vfs::http::fileattributes {dirurl headers path args} { ::vfs::log "fileattributes $args" switch -- [llength $args] { 0 { @@ -253,7 +428,6 @@ proc vfs::http::fileattributes {dirurl path args} { } } -proc vfs::http::utime {dirurl path actime mtime} { +proc vfs::http::utime {dirurl headers path actime mtime} { vfs::filesystem posixerror $::vfs::posix(EROFS) } -