R скрипт для стандартизации данных
# The script is designed to fit community matrices into GBIF sampling event and occurrence templates
# The raw data (=community matrix) should have taxon names in columns and samples as rows
# Templates are available under https://github.com/gbif/ipt/wiki/samplingEventData#templates
# Resulting data frames should be complemented with metadata during publishing through an IPT
# Script contain a set of commands that can be combined and customized depending on dataset
# Author - Anton Potapov
# Version - 1.3 beta (26.09.2019)
# The script can be freely developed and distributed under the CC-BY license (https://creativecommons.org/licenses/by/4.0/)
# You may report bugs or updated versions to potapov.msu@gmail.com so I keep everybody up-to-date
### 1: Install and load packages -----------------------------------------------------------------
# List the packages
used.packages <- c("tidyverse", # a collection of packages, including ggplot, dplyr and more
"reshape2", # data transformation
"reshape", # data transformation
"janitor", # data cleaning
"taxize" # check taxonomic names
)
# Install and/or update all packages (if needed)
lapply(used.packages, install.packages, character.only = TRUE)
lapply(used.packages, update.packages, character.only = TRUE)
# Load all packages in the working space
lapply(used.packages, library, character.only = TRUE)
### 2: Load and check the data -----------------------------------------------------------------
# Load data (replace filename with name of your dataset). You may also replace word 'Dataset' to a more appropriate across the script
Dataset <- read.csv(file = "filename.csv", header = T,
dec = ".", # the decimal point (depends on the system settings)
sep = ",", # the separator symbol (depends on the system settings)
stringsAsFactors=F, # avoid creating factors
strip.white=T) # trim spaces before and after values in text variables
# See if the data was uploaded correctly
# structure of data
str(Dataset)
# summary statistics of data
summary(Dataset)
# explore data (for RStudio)
View(Dataset)
# clean data from empty columns and rows (if necessary)
Dataset <- remove_empty(Dataset, which=c("rows", "cols"))
# check if factor data (text) are consistent: the best is to check one by one each variable in the matrix (except for taxa counts)
# IMPORTANT the format across data in each column should be uniform
# IMPORTANT check spelling mistakes
unique(Dataset$variable1) # unique values; to use, substitute 'variable1' with the variable of interest OR
table(Dataset$variable1) # unique values with counts; yo use, substitute 'variable1' with the variable of interest
# You may correct the dataset here (alternatively, you should correct the initial spreadsheet)
Dataset$variable1[Dataset$variable1=="wrong value"] <- "right value"
# check if numeric data (counts) are consistent: no negative values, no too large values etc.
# here the procedure is simple - check max and min; the procedure can turn to more elaborated later
Dataset %>%
select_if(is.numeric) %>%
summarize_all(list(min, max))
# You may correct the dataset here (alternatively, you should correct the initial spreadsheet)
# put in the right variable1 (taxon name) and substitute "wrong value" with a number (without cuotes)
# in case of systematic mistakes some other functions can be used
Dataset$variable1[Dataset$variable1=="wrong number"] <- "right number"
#Check duplicated samples and remove if any (sample names should be unique!!!)
which(table(Dataset$sample)>1)
Dataset <- Dataset[!duplicated(Dataset$sample), ]
### 3: Sampling events -----------------------------------------------------------------
# first table contains all sampling series (i.e. list of soil cores/pitfall traps/transects) and information abour them
# sampling series can be nested inside each other (here example without nesting)
# select data from the uploaded dataset (select for columns and filter for rows); see dplyr documentation
# "A" is just the object name that is short and thus fast to address
A <- Dataset %>%
select() # example, exclusion of species 3: select(Sample, Habitat, Date, Species1, Species2, -Species3)
filter() # example, select only one habitat and one year: filter(Habitat=="Forest"&Date=="2015")
# Below we will add all necessary columns (all of them and more can be found in the GBIF template and Darwin core standards)
### General info
# Sampling event data type (for community matrices)
A$type <- 'Event'
# Create unique ID of each sample (=sample name); Can be present in the dataset or combined from other columns (as in the example below)
A$eventID <- paste("Name", A$Sample, A$Habitat, A$Date, A$Layer, sep="-") #example
# Each identifier should be unique - two functions should give the same number
length(A$eventID)
length(unique(A$eventID))
# In case of e.g. sampling by layers, each event can be a part of a larger event (layer is a part of soil core)
# For all linked sampling events, coming e.g. from the same soil core, the same parent event should be added (not unique in this table)
A$parentEventID <- paste("Name", A$Sample, A$Habitat, A$Date, sep="-") #example
# Normally, it should be less parent events, than events
length(unique(A$parentEventID))
# Name the dataset
A$datasetName <- 'Bugs from my forest in year 2015'
### Sampling
# Describe sampling protocol. You may provide references
A$samplingProtocol <- "Heat extraction of animals from 8cm2 soil cores, Kuznetsova 2006 doi:10.1016/j.pedobi.2005.12.004"
# Size of the samples (units see below)
A$sampleSizeValue <- 8
# Units of the sample size
A$sampleSizeUnit <- "square_centimetre"
### Time
# date should be put in format YYYY-MM-DD
A$eventDate <- "2015-05-12" # example for one date across events
# if there were several dates and they listed in the table in a wrong format, we can transform them
# for example this transformation for format "DD/MM/YYYY", it replaces parts of the date and put new separator
# to fit your format function needs to be modified; see ?substr
A$eventDate <- paste(substr(A$Date,7,10),substr(A$Date,4,5),substr(A$Date,1,2),sep = '-')
# add year
A$year <- 2015
# add month (1-12)
A$month <- 5
### Location
# for the right formats of the data, see the hints in the GBIF templates / darwin core documentation
# templates are available under https://github.com/gbif/ipt/wiki/samplingEventData#templates
A$country <- 'Russian Federation'
A$countryCode <- 'RU'
A$stateProvince <- 'Moscow region'
# the most precise location name (custom)
A$locality <- 'Village Afanaskino surroundings'
# unique ID of location (for example, internal code that you use for your site)
A$locationID <- 'Afanaskino_ForestB123'
# habitat type
A$habitat <- 'Spruce forest'
# event remarks
A$eventRemarks <- 'Some additional information'
# coordinates in decimal format; it is necessary to provide source of coordinates
A$decimalLatitude <- 55.458087
A$decimalLongitude <- 37.176769
# type of georeference data, see http://rs.tdwg.org/dwc/terms/geodeticDatum
A$geodeticDatum <- 'WGS84'
# data of georeferencing
A$georeferencedDate <- 2015
# source of georeferencing
A$georeferenceSources <- 'Garmin GPSMAP 62'
# precision of georeferencing
A$coordinateUncertaintyInMeters <- 20
### Copyright
# who owns the data
A$rightsHolder <- 'Mister Peabody'
# how the data can be used? Usually creative commons licence: CC-BY = free to use if you provide a reference to original source
A$accessRights <- 'CC-BY'
# how the data should be cited
A$bibliographicCitation <- 'Kuznetsova, N.A., 2006. Long-term dynamics of Collembola in two contrasting ecosystems. Pedobiologia 50, 157–164. doi:10.1016/j.pedobi.2005.12.004'
### Finally select all the columns you want to keep. Add or remove columns according to the information you have
Dataset_Events <- A %>%
select(datasetName,type,eventID,samplingProtocol,sampleSizeValue,sampleSizeUnit,eventDate,year,month,country,countryCode,stateProvince,
locality,locationID,habitat,eventRemarks,decimalLatitude,decimalLongitude,geodeticDatum,georeferenceSources,
coordinateUncertaintyInMeters,rightsHolder,accessRights,bibliographicCitation)
### 3: Occurrences -----------------------------------------------------------------
# second table contains all taxonomic records from the sampling series (i.e. list of organisms and their abundances)
# we continue work with the dataframe 'A' since it has 'eventID' column
# select eventID and all columns with taxon names. Instead of 8 and 60 put in the first and the last number of columns with taxonomic names
A1 <- A %>%
select(eventID,8:60)
# we transform community matrix into the long-format table
A1 <- melt(A1, id.vars = 'eventID')
# remove zeros (we report only presence and not absence of taxa)
A1 <- A1 %>%
filter(value > 0)
### Add basic information
# for the right formats of the data, see the hints in the GBIF templates / darwin core documentation
# templates are available under https://github.com/gbif/ipt/wiki/samplingEventData#templates
# this is just for generation of IDs for each occurrence from the name of event and a number 1,2,3,4,5,...
A1$occurrenceID <- paste(A1$eventID,'-',seq(1:length(A1$eventID)),sep='')
# what the observation is based on?
A1$basisOfRecord <- 'PreservedSpecimen'
# who has found the organism?
A1$recordedBy <- "Mister Peabody"
# who has identified the organism?
A1$identifiedBy <- "Mister Sherman"
# occurrences are linked to sampling events
A1$type <- 'Event'
### Count data
# the third column is the count of individuals (rename)
names(A1)[3] <- 'individualCount'
# also count of individuals, but can be substituted by other measures (e.g. biomass)
A1$organismQuantity <- A1$individualCount
# units of quantity
A1$organismQuantityType <- 'individuals'
# presence or absence of a taxon (we report only presence)
A1$occurrenceStatus <- 'present'
### Taxonomic names
# the second column is taxonomic names (rename)
# A Linnean-style taxonomic name (with or without author)
names(A1)[2] <- 'scientificName'
# CAREFULLY check if all taxonomic names in the table are ok with no typos, no extra symbols etc.
table(A1$scientificName)
# If taxonomic names were exported with "." instead of spaces as separators, they need to be replaced:
A1$scientificName <- gsub("[.]", " ", A1$scientificName)
# if necessary, correct
A1$scientificName[A1$scientificName=='Wrong name'] <- 'Right name'
# You may add some higher-rank hierarchy across all occurrences. At least kingdom and phylum.
# Check the GBIF classification at https://www.gbif.org/species/search
# The classification may be not perfect for your taste. It does not matter as long as it is match a standard system
A1$kingdom <- 'Animalia'
A1$phylum <- 'Arthropoda'
A1$class <- 'Entognatha'
A1$order <- 'Collembola'
# Your data can have species names, genera names, families names as the taxa. This information can be mixed in one dataset
# CAREFULLY check the taxonomic ranks for all your taxa in the table
table(A1$scientificName)
### Check the validity of taxonomic names and add GBIF IDs
# create a list of all taxa in a separate data frame
taxon_list <- aggregate(organismQuantity~scientificName,A1,sum)
# this command from the package 'taxize' will compare all the taxonomic names in the dataset versus GBIF taxonomic backbone
# while running, you may be asked to choose between the matches in the GBIF (type 1 or 2 or 3 or 4... and press enter)
gbif_list <- get_gbifid(taxon_list$scientificName)
# we convert the results in a data frame to work on it
gbif_list <- as.data.frame(gbif_list)
# integrate taxon list and GBIF IDs
taxon_list <- cbind(taxon_list,gbif_list) # combine taxa and IDs
# we have GBIF IDs in this table. Now we will get the correct taxonomic names from the GBIF (this way we also correcting typos)
# this will be done with 'id2name' function and then we will convert resulting table in a data frame
gbif_name <- id2name(taxon_list$ids, db = 'gbif')
gbif_name <- as.data.frame(do.call(rbind.data.frame, gbif_name))
# integrate taxon list and GBIF names
taxon_list <- cbind(taxon_list,gbif_name)
# select only columns that we need: original name (scientificName), GBIF name (name), GBIF ID (uri), taxonomic rank
taxon_list <- taxon_list %>%
select(scientificName,name,uri,rank)
# rename to the right colum names
names(taxon_list) <- c('scientificName','gbifName','taxonID','taxonRank')
# merge back - now we add all the information (IDs, GBIF names and ranks) to the ocurrence table
A1 <- merge(A1,taxon_list,by = 'scientificName')
# Replace original names with GBIF-based
# IMPORTANT!!! Run this step only if you want to replace original names!!! If you want to keep original names - just skip this
# The function will replace original names with gbif names only if they are present, or keep originals, if no gbif name present
A1$scientificName[is.na(A1$gbifName)==F] <- A1$gbifName[is.na(A1$gbifName)==F]
# Remove gbifName from the final table
A1 <- A1 %>%
select(-gbifName)
# Final occurrence table
Dataset_Occurrences <- A1
### 4: Checking and exporting the final tables -----------------------------------------------------------------
### DATA CHECK - events
# types of data: check if the formats are correct
str(Dataset_Events)
# list of all unique entries: check if some entries are strange and correct in the script above or original data
lapply(Dataset_Events,unique)
# check if some events are duplicated
which(table(Kuznetsova_Monitoring_events$eventID)>1)
# check if any columns have missing values
apply(Dataset_Events, 2, function(x) any(is.na(x)))
### DATA CHECK - occurrences
# types of data: check if the formats are correct
str(Dataset_Occurrences)
# list of all unique entries (excluding individual IDs): check if some entries are strange and correct above
lapply(select(Dataset_Occurrences,-occurrenceID),unique)
# check if any columns have missing values
apply(Dataset_Occurrences, 2, function(x) any(is.na(x)))
# check if numbers are adequate (select all numerical variables, calculate min and max)
Dataset_Occurrences %>%
select_if(is.numeric) %>%
summarize_all(list(min, max))
### DATA EXPORT
# write two final tables for upload - events and occurrences.
write.csv(Dataset_Events,"Events_Dataset.csv")
write.csv(Dataset_Occurrences,"Dataset_Occurrences.csv")
# Separately prepare metadata for the dataset (text):
# Dataset name, author, dataset description, geographic scope, taxonomic scope, temporal scope, methods,
# quality control, contacts, references
# Finally, events and occurrences can be publishing through an IPT (https://www.gbif.org/ipt)