R скрипт для стандартизации данных

R скрипт для стандартизации данных 

загрузить в виде файла

# The script is designed to fit community matrices into GBIF sampling event and occurrence templates
# The raw data (=community matrix) should have taxon names in columns and samples as rows
# Templates are available under https://github.com/gbif/ipt/wiki/samplingEventData#templates
# Resulting data frames should be complemented with metadata during publishing through an IPT
# Script contain a set of commands that can be combined and customized depending on dataset

# Author - Anton Potapov
# Version - 1.3 beta (26.09.2019)
# The script can be freely developed and distributed under the CC-BY license (https://creativecommons.org/licenses/by/4.0/)
# You may report bugs or updated versions to potapov.msu@gmail.com so I keep everybody up-to-date


### 1: Install and load packages -----------------------------------------------------------------

# List the packages
used.packages <- c("tidyverse", # a collection of packages, including ggplot, dplyr and more
                   "reshape2", # data transformation
                   "reshape", # data transformation
                   "janitor", # data cleaning
                   "taxize" # check taxonomic names
)

# Install and/or update all packages (if needed)
lapply(used.packages, install.packages, character.only = TRUE)
lapply(used.packages, update.packages, character.only = TRUE)

# Load all packages in the working space
lapply(used.packages, library, character.only = TRUE)

### 2: Load and check the data -----------------------------------------------------------------

# Load data (replace filename with name of your dataset). You may also replace word 'Dataset' to a more appropriate across the script
Dataset <- read.csv(file = "filename.csv", header = T, 
                                                 dec = ".",  # the decimal point (depends on the system settings)
                                                 sep = ",", # the separator symbol (depends on the system settings)
                                                 stringsAsFactors=F, # avoid creating factors
                                                 strip.white=T) # trim spaces before and after values in text variables

# See if the data was uploaded correctly
# structure of data
str(Dataset)

# summary statistics of data
summary(Dataset) 

# explore data (for RStudio)
View(Dataset)

# clean data from empty columns and rows (if necessary)
Dataset <- remove_empty(Dataset, which=c("rows", "cols"))

# check if factor data (text) are consistent: the best is to check one by one each variable in the matrix (except for taxa counts)
# IMPORTANT the format across data in each column should be uniform
# IMPORTANT check spelling mistakes
unique(Dataset$variable1) # unique values; to use, substitute 'variable1' with the variable of interest OR
table(Dataset$variable1) # unique values with counts; yo use, substitute 'variable1' with the variable of interest

# You may correct the dataset here (alternatively, you should correct the initial spreadsheet)
Dataset$variable1[Dataset$variable1=="wrong value"] <- "right value"

# check if numeric data (counts) are consistent: no negative values, no too large values etc.
# here the procedure is simple - check max and min; the procedure can turn to more elaborated later
Dataset %>%
  select_if(is.numeric) %>%
  summarize_all(list(min, max))

# You may correct the dataset here (alternatively, you should correct the initial spreadsheet)
# put in the right variable1 (taxon name) and substitute "wrong value" with a number (without cuotes)
# in case of systematic mistakes some other functions can be used
Dataset$variable1[Dataset$variable1=="wrong number"] <- "right number" 

#Check duplicated samples and remove if any (sample names should be unique!!!)
which(table(Dataset$sample)>1)
Dataset <- Dataset[!duplicated(Dataset$sample), ]


### 3: Sampling events  -----------------------------------------------------------------
# first table contains all sampling series (i.e. list of soil cores/pitfall traps/transects) and information abour them
# sampling series can be nested inside each other (here example without nesting)
# select data from the uploaded dataset (select for columns and filter for rows); see dplyr documentation
# "A" is just the object name that is short and thus fast to address

A <- Dataset %>%
  select() # example, exclusion of species 3: select(Sample, Habitat, Date, Species1, Species2, -Species3) 
  filter() # example, select only one habitat and one year: filter(Habitat=="Forest"&Date=="2015")

# Below we will add all necessary columns (all of them and more can be found in the GBIF template and Darwin core standards)

### General info
# Sampling event data type (for community matrices)
A$type <- 'Event' 

# Create unique ID of each sample (=sample name); Can be present in the dataset or combined from other columns (as in the example below)
A$eventID <- paste("Name", A$Sample, A$Habitat, A$Date, A$Layer, sep="-") #example

# Each identifier should be unique - two functions should give the same number
length(A$eventID)
length(unique(A$eventID))

# In case of e.g. sampling by layers, each event can be a part of a larger event (layer is a part of soil core)
# For all linked sampling events, coming e.g. from the same soil core, the same parent event should be added (not unique in this table)
A$parentEventID <- paste("Name", A$Sample, A$Habitat, A$Date, sep="-") #example

# Normally, it should be less parent events, than events
length(unique(A$parentEventID))

# Name the dataset
A$datasetName <- 'Bugs from my forest in year 2015' 


### Sampling
# Describe sampling protocol. You may provide references
A$samplingProtocol <- "Heat extraction of animals from 8cm2 soil cores, Kuznetsova 2006 doi:10.1016/j.pedobi.2005.12.004"

# Size of the samples (units see below)
A$sampleSizeValue <- 8 

# Units of the sample size
A$sampleSizeUnit <- "square_centimetre"


### Time
# date should be put in format YYYY-MM-DD
A$eventDate <- "2015-05-12" # example for one date across events
  
# if there were several dates and they listed in the table in a wrong format, we can transform them
# for example this transformation for format "DD/MM/YYYY", it replaces parts of the date and put new separator
# to fit your format function needs to be modified; see ?substr
A$eventDate <- paste(substr(A$Date,7,10),substr(A$Date,4,5),substr(A$Date,1,2),sep = '-') 

# add year
A$year <- 2015

# add month (1-12)
A$month <- 5


### Location
# for the right formats of the data, see the hints in the GBIF templates / darwin core documentation
# templates are available under https://github.com/gbif/ipt/wiki/samplingEventData#templates
A$country <- 'Russian Federation' 
A$countryCode <- 'RU'
A$stateProvince <- 'Moscow region'

# the most precise location name (custom)
A$locality <- 'Village Afanaskino surroundings' 

# unique ID of location (for example, internal code that you use for your site)
A$locationID <- 'Afanaskino_ForestB123' 

# habitat type
A$habitat <- 'Spruce forest'

# event remarks
A$eventRemarks <- 'Some additional information'

# coordinates in decimal format; it is necessary to provide source of coordinates
A$decimalLatitude <- 55.458087
A$decimalLongitude <- 37.176769

# type of georeference data, see http://rs.tdwg.org/dwc/terms/geodeticDatum
A$geodeticDatum <- 'WGS84'

# data of georeferencing 
A$georeferencedDate <- 2015

# source of georeferencing 
A$georeferenceSources <- 'Garmin GPSMAP 62'

# precision of georeferencing 
A$coordinateUncertaintyInMeters <- 20


### Copyright
# who owns the data
A$rightsHolder <- 'Mister Peabody' 

# how the data can be used? Usually creative commons licence: CC-BY = free to use if you provide a reference to original source
A$accessRights <- 'CC-BY' 

# how the data should be cited
A$bibliographicCitation <- 'Kuznetsova, N.A., 2006. Long-term dynamics of Collembola in two contrasting ecosystems. Pedobiologia 50, 157–164. doi:10.1016/j.pedobi.2005.12.004'


### Finally select all the columns you want to keep. Add or remove columns according to the information you have
Dataset_Events <- A %>% 
  select(datasetName,type,eventID,samplingProtocol,sampleSizeValue,sampleSizeUnit,eventDate,year,month,country,countryCode,stateProvince,
         locality,locationID,habitat,eventRemarks,decimalLatitude,decimalLongitude,geodeticDatum,georeferenceSources,
         coordinateUncertaintyInMeters,rightsHolder,accessRights,bibliographicCitation)

### 3: Occurrences  -----------------------------------------------------------------
# second table contains all taxonomic records from the sampling series (i.e. list of organisms and their abundances)

# we continue work with the dataframe 'A' since it has 'eventID' column
# select eventID and all columns with taxon names. Instead of 8 and 60 put in the first and the last number of columns with taxonomic names
A1 <- A %>% 
  select(eventID,8:60)

# we transform community matrix into the long-format table
A1 <- melt(A1, id.vars = 'eventID')

# remove zeros (we report only presence and not absence of taxa)
A1 <- A1 %>% 
  filter(value > 0)

### Add basic information
# for the right formats of the data, see the hints in the GBIF templates / darwin core documentation
# templates are available under https://github.com/gbif/ipt/wiki/samplingEventData#templates

# this is just for generation of IDs for each occurrence from the name of event and a number 1,2,3,4,5,...
A1$occurrenceID <- paste(A1$eventID,'-',seq(1:length(A1$eventID)),sep='')

# what the observation is based on?
A1$basisOfRecord <- 'PreservedSpecimen' 

# who has found the organism? 
A1$recordedBy <- "Mister Peabody"

# who has identified the organism?
A1$identifiedBy <- "Mister Sherman"

# occurrences are linked to sampling events
A1$type <- 'Event'


### Count data
# the third column is the count of individuals (rename)
names(A1)[3] <- 'individualCount'

# also count of individuals, but can be substituted by other measures (e.g. biomass)
A1$organismQuantity <- A1$individualCount

# units of quantity
A1$organismQuantityType <- 'individuals'

# presence or absence of a taxon (we report only presence)
A1$occurrenceStatus <- 'present'


### Taxonomic names
# the second column is taxonomic names (rename)
# A Linnean-style taxonomic name (with or without author)
names(A1)[2] <- 'scientificName'

# CAREFULLY check if all taxonomic names in the table are ok with no typos, no extra symbols etc.
table(A1$scientificName) 

# If taxonomic names were exported with "." instead of spaces as separators, they need to be replaced:
A1$scientificName <- gsub("[.]", " ", A1$scientificName)

# if necessary, correct
A1$scientificName[A1$scientificName=='Wrong name'] <- 'Right name'

# You may add some higher-rank hierarchy across all occurrences. At least kingdom and phylum. 
# Check the GBIF classification at https://www.gbif.org/species/search
# The classification may be not perfect for your taste. It does not matter as long as it is match a standard system
A1$kingdom <- 'Animalia'
A1$phylum <- 'Arthropoda'
A1$class <- 'Entognatha'
A1$order <- 'Collembola'

# Your data can have species names, genera names, families names as the taxa. This information can be mixed in one dataset
# CAREFULLY check the taxonomic ranks for all your taxa in the table
table(A1$scientificName) 


### Check the validity of taxonomic names and add GBIF IDs
# create a list of all taxa in a separate data frame
taxon_list <- aggregate(organismQuantity~scientificName,A1,sum) 

# this command from the package 'taxize' will compare all the taxonomic names in the dataset versus GBIF taxonomic backbone
# while running, you may be asked to choose between the matches in the GBIF (type 1 or 2 or 3 or 4... and press enter)
gbif_list <- get_gbifid(taxon_list$scientificName)

# we convert the results in a data frame to work on it
gbif_list <- as.data.frame(gbif_list) 

# integrate taxon list and GBIF IDs
taxon_list <- cbind(taxon_list,gbif_list) # combine taxa and IDs

# we have GBIF IDs in this table. Now we will get the correct taxonomic names from the GBIF (this way we also correcting typos)
# this will be done with 'id2name' function and then we will convert resulting table in a data frame
gbif_name <- id2name(taxon_list$ids, db = 'gbif')
gbif_name <- as.data.frame(do.call(rbind.data.frame, gbif_name)) 

# integrate taxon list and GBIF names
taxon_list <- cbind(taxon_list,gbif_name) 

# select only columns that we need: original name (scientificName), GBIF name (name), GBIF ID (uri), taxonomic rank 
taxon_list <- taxon_list %>% 
  select(scientificName,name,uri,rank)

# rename to the right colum names
names(taxon_list) <- c('scientificName','gbifName','taxonID','taxonRank')

# merge back - now we add all the information (IDs, GBIF names and ranks) to the ocurrence table
A1 <- merge(A1,taxon_list,by = 'scientificName')

# Replace original names with GBIF-based
# IMPORTANT!!! Run this step only if you want to replace original names!!! If you want to keep original names - just skip this
# The function will replace original names with gbif names only if they are present, or keep originals, if no gbif name present
A1$scientificName[is.na(A1$gbifName)==F] <- A1$gbifName[is.na(A1$gbifName)==F]

# Remove gbifName from the final table
A1 <- A1 %>% 
  select(-gbifName)

# Final occurrence table
Dataset_Occurrences <- A1 


### 4: Checking and exporting the final tables -----------------------------------------------------------------

### DATA CHECK - events
# types of data: check if the formats are correct
str(Dataset_Events)

# list of all unique entries: check if some entries are strange and correct in the script above or original data
lapply(Dataset_Events,unique)

# check if some events are duplicated
which(table(Kuznetsova_Monitoring_events$eventID)>1)

# check if any columns have missing values
apply(Dataset_Events, 2, function(x) any(is.na(x)))


### DATA CHECK - occurrences
# types of data: check if the formats are correct
str(Dataset_Occurrences)

# list of all unique entries (excluding individual IDs): check if some entries are strange and correct above 
lapply(select(Dataset_Occurrences,-occurrenceID),unique)

# check if any columns have missing values
apply(Dataset_Occurrences, 2, function(x) any(is.na(x)))

# check if numbers are adequate (select all numerical variables, calculate min and max)
Dataset_Occurrences %>%
  select_if(is.numeric) %>%
  summarize_all(list(min, max))


### DATA EXPORT
# write two final tables for upload - events and occurrences. 

write.csv(Dataset_Events,"Events_Dataset.csv")
write.csv(Dataset_Occurrences,"Dataset_Occurrences.csv")

# Separately prepare metadata for the dataset (text):
# Dataset name, author, dataset description, geographic scope, taxonomic scope, temporal scope, methods,
# quality control, contacts, references
# Finally, events and occurrences can be publishing through an IPT (https://www.gbif.org/ipt)