#! /bin/sh
# This is a shell archive, meaning:
# 1. Remove everything above the #! /bin/sh line.
# 2. Save the resulting text in a file.
# 3. Execute the file with /bin/sh (not csh) to create the files:
# Copyright
# README
# kde
# kde.S
# This archive created: Fri Jun 18 11:16:16 1993
# By: G.P.Nason ()
export PATH; PATH=/bin:$PATH
echo shar: extracting "'Copyright'" '(605 characters)'
if test -f 'Copyright'
then
echo shar: will not over-write existing file "'Copyright'"
else
cat << \SHAR_EOF > 'Copyright'
/*
* Copyright (C) 1993 Guy Nason
*
* Permission to use, copy, and distribute this software and its
* documentation for any non-commercial purpose without fee
* is hereby granted, provided that the above copyright notice appear
* in all copies and that both that copyright notice and this permission
* notice appear in supporting documentation.
*
* Permission to modify the software is granted, but not the right to
* distribute the modified code.
*
* This software is provided "as is" without express or implied warranty.
*
*
* AUTHOR
*
* Guy Nason, G.P.Nason@bristol.ac.uk
*
*/
SHAR_EOF
fi # end of overwriting check
echo shar: extracting "'README'" '(1371 characters)'
if test -f 'README'
then
echo shar: will not over-write existing file "'README'"
else
cat << \SHAR_EOF > 'README'
Two-dimensional kernel density estimation.
This submission consists of two files.
kde.S: This is the S source. To access these you need to enter
S and type
source("kde.S")
Three functions will appear in your .Data directory.
They should be functionally equivalent.
kde: faster than kde2, but may not work on
large problems,
kde2: slower than kde, but may work on larger problems
dxmake: subsidiary function for kde
kde: This is the help file for the two functions. It should be
placed in the .Data/.Help (or .Functions/.Help) directory.
Then you should be able to type help("kde") as normal.
The method behind the software can be found in
Silverman, B.W. (1986) Density estimation: for statistics and data analysis
Chapman and Hall, London. (especially Chapter 4)
The output of the functions can be passed directly to the contour function
in SPlus.
If you have a good contouring package then kde can produce partial derivatives
of the density estimate and these can be used to produce smooth contours.
Have fun!
If you have any problems with the software then email
G.P.Nason@bristol.ac.uk
and I might be able to help.
Guy Nason
Statistics Group
Department of Mathematics
University of Bristol
University Walk
BRISTOL BS8 1TW
England,UK
Telephone: +44 272 303030x4354 Email: G.P.Nason@bristol.ac.uk
SHAR_EOF
fi # end of overwriting check
echo shar: extracting "'kde'" '(4081 characters)'
if test -f 'kde'
then
echo shar: will not over-write existing file "'kde'"
else
cat << \SHAR_EOF > 'kde'
.BG
.FN kde
.TL
Estimate Two-Dimensional Probability Density Function (also kde2)
.DN
This function computes a non-parametric estimate of the probability
density of two-dimensional data (and partial derivatives)
using a two-dimensional kernel density estimate.
There is an option to manually change the bandwidth (window width)
of the density estimate.
.CS
kde(data.x, data.y, n=10, width=<>, from=<>,
to=<>, derivatives=F)
.RA
.AG data.x
Vector containing the x coordinate of your data
.AG data.y
Vector containing the y coordinate of your data
.OA
.AG n
The density is estimated on a grid; n equally spaced points are chosen
on the x and y axes and together these define a grid of n-squared
points where the density is estimated.
.AG width
bandwidth of the kernel density estimate. The default is the
``choice of window width for a standard distribution'' from
Silverman 1986 page 86. This rule is 0.96*length(data.x)^(-1/6)
and is probably totally inappropriate - however, it is a quick
and dirty rule.
.AG from
this is a list containing two components: x and y. It specifies the
``bottom-left'' hand coordinate of the evaluation grid. By default this
point is the coordinate obtained by using the smallest data x value,
and the smallest data y value and subtracting off 3 times the window
width.
.AG to
as from, but for the top-right hand corner.
.AG derivatives
some contouring packages can make use of derivative information
to produce nice smooth looking contours.
.RT
A list. The components depend on whether derivatives=T or derivatives=F.
.PP
If derivatives=F, then a list a suitable for passing to "contour"
is produced. The components are:
.RC x
A vector containing the x coordinates of the grid points where the
density is evalualted.
.RC y
As for x, but in the y direction!
.RC z
A matrix of dimension length(x) by length(y) containing the density
estimate. The [i,j]th value contains the estimate for the point (x[i],y[j]).
.RC width
The bandwidth used in computing the density estimate.
Although this may be specified by the user, it is sometimes
computed by the routine and it's useful to know what it is.
.PP
If derivatives=T then the components are the same as for derivatives=F
except z, which is described as follows:
.RC z
An array of dimension length(x) by length(y) by 3 - think of it
as three matrices stacked together. The first matrix [,,1] is simply
the density estimate, the same as z when derivatives=F. The second and
third matrices, [,,2] and [,,3], are the x- and y-derivatives of the estimate.
So, z[i,j,1] is the value of the estimate at (x[i],y[j]),
z[i,j,2] is the x-derivative at (x[i],y[j]) and z[i,j,3] is the
z-derivative. (Bath users: this option should be used for the
"smooth contour plotting" achieved with scontour).
.DT
We implement a two-dimensional kernel density estimation routine.
The formula and important details are described in
Silverman 1986.
.PP
The routine works by putting a two-dimensional bump (kernel)
at each data point. The density estimate at each grid point is computed
by summing the contribution from each bump and dividing by an appropriate
quantity.
.PP
The kde function uses the outer function and so is faster than kde2
which does the same thing with for loops. However, for large numbers
of grid or data points the kde2 routine may be useful as it doesn't
store large matrices.
.SP
.SH BACKGROUND
Density estimation is essentially a smoothing operation.
Inevitably there is a trade-off between bias in the estimate and the
estimate's variability: wide windows will produce smooth estimates that
may hide local features of the density.
.PP
These functions (kde, kde2, dxmake) were originally hacked-up for drawing
density estimates of projected data from exploratory projection pursuit.
.SH REFERENCES
Silverman, B. W. (1986).
.ul
Density Estimation for Statistics and Data Analysis.
Chapman and Hall, London.
.SA
`hist', `ksmooth', `density', `scontour(Bath only)'.
.EX
#
# Have data coordinates in x and y
#
> contour(kde(x,y))
.KW dplot
.KW distribution
.KW smooth
.WR
SHAR_EOF
fi # end of overwriting check
echo shar: extracting "'kde.S'" '(3432 characters)'
if test -f 'kde.S'
then
echo shar: will not over-write existing file "'kde.S'"
else
cat << \SHAR_EOF > 'kde.S'
"kde"<-
function(data.x, data.y, n = 10, width, from, to, derivatives = F)
{
#
# Check that data vectors are the same length
#
if(length(data.x) != length(data.y)) stop(
"Data vectors data.x and data.y must be the same length"
) #
# Find out if we have a bandwidth. If not, choose one according to
# Silverman 1986, page 87
#
if(missing(width))
h <- 0.95999999999999996 * length(data.x)^(-1/6)
else h <- width
rx <- range(data.x)
ry <- range(data.y) #
#
# If we don't have a "from" grid starting point make one up
#
if(missing(from)) from <- list(x = rx[1] - 3 * h, y = ry[1] - 3 * h) #
#
# The same for to
#
if(missing(to)) to <- list(x = rx[2] + 3 * h, y = ry[2] + 3 * h) #
#
# Make up the grid
#
x <- seq(from = from$x, to = to$x, length = n)
y <- seq(from = from$y, to = to$y, length = n) #
#
# Make the answer array
#
m <- array(dim = c(length(x), length(y), 3)) #
#
# Make the (x-Xi)y)/h and (y-Yi)/h values
#
ax <- outer(x, data.x, FUN = "dxmake", h = h)
ay <- outer(y, data.y, FUN = "dxmake", h = h) #
#
# Compute the density estimate and derivatives
#
m[, , 1] <- matrix(dnorm(ax), nrow(ax), ncol(ax)) %*% t(matrix(dnorm(
ay), nrow(ay), ncol(ay)))
if(derivatives == T) {
m[, , 2] <- ( - ax * matrix(dnorm(ax), nrow(ax), ncol(ax))) %*%
t(matrix(dnorm(ay), nrow(ay), ncol(ay)))
m[, , 3] <- matrix(dnorm(ax), nrow(ax), ncol(ax)) %*% t( - ay *
matrix(dnorm(ay), nrow(ay), ncol(ay)))
}
m[, , 1] <- m[, , 1]/(length(data.x) * h^2)
if(derivatives == T) {
m[, , 2] <- m[, , 2]/(length(data.x) * h^3)
m[, , 3] <- m[, , 3]/(length(data.x) * h^3)
}
if(derivatives == F)
return(list(x = x, y = y, z = m[, , 1], width = h))
else if(derivatives == T)
return(list(x = x, y = y, z = m, width = h))
else stop("Don't understand derivatives argument\n")
}
"kde2"<-
function(data.x, data.y, n = 10, width, from, to, derivatives = F)
{
#
# Check that data vectors are the same length
#
if(length(data.x) != length(data.y)) stop(
"Data vectors data.x and data.y must be the same length"
) #
# Find out if we have a bandwidth. If not, choose one according to
# Silverman 1986, page 87
#
if(missing(width))
h <- 0.95999999999999996 * length(data.x)^(-1/6)
else h <- width
rx <- range(data.x)
ry <- range(data.y) #
#
# If we don't have a "from" grid starting point make one up
#
if(missing(from)) from <- list(x = rx[1] - 3 * h, y = ry[1] - 3 * h) #
#
# The same for to
#
if(missing(to)) to <- list(x = rx[2] + 3 * h, y = rx[2] + 3 * h) #
#
# Make up the grid
#
x <- seq(from = from$x, to = to$x, length = n)
y <- seq(from = from$y, to = to$y, length = n) #
#
# Make the answer array
#
m <- array(dim = c(length(x), length(y), 3))
for(i in 1:length(x)) {
dx <- (x[i] - data.x)/h
dnx <- dnorm(dx)
for(j in 1:length(y)) {
dy <- (y[j] - data.y)/h
d <- dnx * dnorm(dy)
m[i, j, 1] <- sum(d)
if(derivatives == T) {
m[i, j, 2] <- sum( - dx * d)
m[i, j, 3] <- sum( - dy * d)
}
}
}
m[, , 1] <- m[, , 1]/(length(data.x) * h^2)
if(derivatives == T) {
m[, , 2] <- m[, , 2]/(length(data.x) * h^3)
m[, , 3] <- m[, , 3]/(length(data.x) * h^3)
}
if(derivatives == F)
return(list(x = x, y = y, z = m[, , 1], width = h))
else if(derivatives == T)
return(list(x = x, y = y, z = m, width = h))
else stop("Don't understand derivatives argument\n")
}
"dxmake"<-
function(x, y, h)
{
(x - y)/h
}
SHAR_EOF
fi # end of overwriting check
# End of shell archive
exit 0