+

Duncan Temple Lang

University of California at Davis

Department of Statistics


+
library(XML)
# Directory holding the example KML files. The trailing slash matters
# because the file name is appended to it directly.
dir = "~/Books/XMLTechnologies/XSL/examples/Earthquakes/"
# Parse the 2004 earthquake file into an XML document.
doc = xmlParse(paste0(dir, "2004_Earthquakes_ALL.kml"))

We get all the <Placemark> nodes under the <Folder> nodes.

+
# Select every <Placemark> that is a direct child of a <Folder>. The XPath
# expression uses the prefix "kml", which is supplied as the namespace
# argument.
pl = getNodeSet(doc, "//kml:Folder/kml:Placemark", namespaces = "kml")

Now we extract the longitude and latitude, the magnitude, etc.

Before we do this, let's test one thing. Which is faster: getting the <Placemark> nodes once and then extracting their children in R, or using XPath several times to fetch each of the elements we want (<coordinates>, <description>, <name>) within each <Placemark> directly from the document? Let's time both approaches.

+
# Approach 1: loop in R over the <Placemark> nodes already held in `pl` and
# pull the <description> child of each one by name.
system.time({desc = sapply(pl, function(x) xmlValue(x[["description"]]))})
   user  system elapsed 
 18.411   0.972  35.986 
# Approach 2: one XPath query over the whole document that selects every
# <description> directly and applies xmlValue to each match.
system.time({desc1 = xpathSApply(doc, "//kml:Folder/kml:Placemark/kml:description", xmlValue, namespaces = "kml")})
   user  system elapsed 
  0.711   0.052   1.262 
# Check that the two approaches produced the same values.
all(desc1 == desc)

So it is much faster (in this context) to use XPath many times rather than using R to access the children by name. Accessing by index may be different.

+
# Same R-side loop over `pl` as before, but the child is selected by
# position instead of by name.
# NOTE(review): assumes <description> is the second child of every
# <Placemark> - confirm against the KML file.
system.time({desc = sapply(pl, function(x) xmlValue(x[[2]]))})
   user  system elapsed 
 10.201   0.697  17.039 

Reading one File

+
# Text of every Placemark <name>.
nm = xpathSApply(doc, "//kml:Folder/kml:Placemark/kml:name",
                 xmlValue, namespaces = "kml")
#nm = sapply(pl, function(x) xmlValue(x[["name"]]))
# The pattern expects names of the form "M <value> - ..." where <value> is
# either a number or the word "None"; only <value> is kept.
mags = gsub("^M (None|[0-9.]+) -.*", "\\1", nm)
# A magnitude reported as "None" is recorded as 0.
mags = replace(mags, mags == "None", "0")
magnitude = as.numeric(mags)
# Check the results: there should be no NA values, then tabulate the
# magnitudes by their integer part.
!any(is.na(magnitude))
table(floor(magnitude))

Now get the depth from the HTML content in the description.

+
# Text content of every Placemark <description>, fetched with one XPath
# query over the whole document.
desc = xpathSApply(doc,
                   "//kml:Folder/kml:Placemark/kml:description",
                   xmlValue,
                   namespaces = "kml")
# Alternative that walks the nodes in `pl` from R instead:
# desc = sapply(pl, function(x) xmlValue(x[["description"]]))
# Extract the depth text from the HTML held in one Placemark description.
#
# x: a character string containing the HTML fragment of a <description>.
#
# Returns the text of the first node matching //tr[2]/td/font, or
# NA_character_ when the fragment contains no such node.
getDepthStringFromDesc =
function(x) {
  hdoc = htmlParse(x, asText = TRUE)
  cells = getNodeSet(hdoc, "//tr[2]/td/font")
  # Indexing an empty node set with [[1]] stops with a subscript error,
  # which would abort the whole pass over the descriptions. Returning NA
  # lets the caller count such entries with is.na() instead.
  if(length(cells) == 0)
    return(NA_character_)
  xmlValue(cells[[1]])
}
# One depth string per description. vapply() enforces a single character
# value per element, and USE.NAMES = FALSE stops each (long) HTML
# description from being attached to the result as an element name.
str = vapply(desc, getDepthStringFromDesc, character(1), USE.NAMES = FALSE)
# Drop everything from the first space onward so that only the leading
# token remains, then convert it to a number.
depth = as.numeric(gsub(" .*", "", str))
# Number of descriptions whose depth could not be converted.
sum(is.na(depth))

Let's get the date

+
# The id attribute of each <Placemark> is parsed as a time stamp.
# NOTE(review): ?strptime documents %Z as output-only, so the zone text in
# the id presumably does not set the time zone and the values are read in
# the session's time zone - confirm this is what is intended.
dates = as.POSIXct(strptime(
  unlist(getNodeSet(doc, "//kml:Folder/kml:Placemark/@id", namespaces = "kml")),
  format = "%Y %b %d %H:%M:%S %Z"))

Now the longitude and latitude

+
# Text of the <coordinates> element inside each Placemark's <Point>.
coords = xpathSApply(doc,
                     "//kml:Folder/kml:Placemark/kml:Point/kml:coordinates",
                     xmlValue,
                     namespaces = "kml")
# Split each string on commas and keep the first two fields, giving a
# character matrix with one row per Placemark and two columns.
locations = t(sapply(strsplit(coords, ",", fixed = TRUE), `[`, 1:2))

Now we assemble the data frame

+
# Start the data frame from the three vectors computed earlier.
eq = data.frame(dates = dates, depth = depth, magnitude = magnitude)
# The first two columns of `locations` are character; convert them to
# numbers and add them as the longitude and latitude columns.
eq[["longitude"]] = as.numeric(locations[, 1])
eq[["latitude"]] = as.numeric(locations[, 2])

# Read an earthquake KML/KMZ document and return its contents as a data frame.
#
# doc: either an already parsed XML document, or a character string. A
#   string that starts with "http" and ends in "kmz" is fetched and unpacked
#   with getKMZURL(); any other string is handed to xmlParse().
#
# Returns `eq`, presumably the data frame created by the <<assemble>> chunk.
#
# The <<...>> lines are literate-programming chunk references; they are
# placeholders for code defined elsewhere in the document.
readKMZEarthquakes =
function(doc)
{
  if(is.character(doc)) {
    if(length(grep("^http.*kmz$", doc)) > 0) {
      # getKMZURL() already parses the first archive entry when
      # readXML = TRUE, so its result is used as the document directly
      # instead of being passed through xmlParse() a second time.
      doc = getKMZURL(doc, readXML = TRUE)
    } else
      doc = xmlParse(doc)
  }

  # Progress message.
  cat("parsed the XML file\n")

  <<magnitude>>


  <<dates>>


  <<description>>


  <<coordinates>>


  <<assemble>>

  eq
}


# Download a KMZ archive and return its contents.
#
# doc: URL of the .kmz file to fetch. Defaults to the
#   2009_Earthquakes_ALL.kmz URL, which this function previously always
#   fetched regardless of the argument supplied.
# readXML: if TRUE (the default), parse the first entry of the archive as
#   XML text and return the parsed document invisibly; otherwise return the
#   archive object itself.
#
# NOTE(review): getURLContent() and zipArchive() are not provided by the
# XML package; presumably they come from RCurl and Rcompression - confirm
# that those packages are attached before this is called.
getKMZURL =
function(doc = 'http://neic.usgs.gov/neis/epic/kml/2009_Earthquakes_ALL.kmz', readXML = TRUE)
{
  kmz = getURLContent(doc)
  ar = zipArchive(kmz)
  if(readXML)
    invisible(xmlParse(ar[[1]], asText = TRUE))
  else
    ar
}