From 727e0c16ca6548e2568c8a7f951cec77c25b5ba6 Mon Sep 17 00:00:00 2001 From: hsonne Date: Mon, 14 Feb 2022 19:16:02 +0100 Subject: [PATCH 1/9] Put "Reading..." and "... ok" on the same line --- R/get_raw_text_from_xlsx.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/get_raw_text_from_xlsx.R b/R/get_raw_text_from_xlsx.R index 5dd60b6..84bab25 100644 --- a/R/get_raw_text_from_xlsx.R +++ b/R/get_raw_text_from_xlsx.R @@ -73,7 +73,7 @@ read_sheet_as_text <- function(file, sheet, dbg = TRUE) result <- kwb.utils::catAndRun( messageText = sprintf(" Reading sheet '%s' as raw text", sheet), - dbg = dbg, newLine = 3, expr = { + dbg = dbg, newLine = 2L, expr = { as.matrix(suppressMessages(readxl::read_xlsx( file, sheet, range = range, col_names = FALSE, col_types = "text" ))) From 4dd54b0f99941370bd4b584937691d46b4ccd1d9 Mon Sep 17 00:00:00 2001 From: hsonne Date: Mon, 14 Feb 2022 19:17:22 +0100 Subject: [PATCH 2/9] Allow to name list elements by table ids --- R/get_tables_from_xlsx.R | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/R/get_tables_from_xlsx.R b/R/get_tables_from_xlsx.R index 09f42e2..6370a38 100644 --- a/R/get_tables_from_xlsx.R +++ b/R/get_tables_from_xlsx.R @@ -1,6 +1,9 @@ # get_text_tables_from_xlsx ---------------------------------------------------- -get_text_tables_from_xlsx <- function(file, table_info = NULL, dbg = TRUE) +get_text_tables_from_xlsx <- function( + file, table_info = NULL, dbg = TRUE, ids_as_names = FALSE +) { + #kwb.utils::assignPackageObjects("kwb.readxl") #kwb.utils::assignArgumentDefaults(get_text_tables_from_xlsx) # Get one character matrix per sheet @@ -14,7 +17,6 @@ get_text_tables_from_xlsx <- function(file, table_info = NULL, dbg = TRUE) # If there was no metadata file, split sheets into tables and create metadata if (! is.null(table_info)) { - stop("Not implemented: extract tables with known ranges") #extract_tables_with_ranges(text_sheets, table_info) } @@ -30,12 +32,16 @@ get_text_tables_from_xlsx <- function(file, table_info = NULL, dbg = TRUE) result <- do.call(c, all_tables) + # Name the elements in the result list according to the table ids + if (ids_as_names) { + result <- stats::setNames(result, table_info$table_id) + } + structure( result, table_info = table_info, sheet_info = get_sheet_info(text_sheets), file = file - #,names = table_info$table_id ) } From f263dcf6d851cc98fcbfd9110596606d713243d8 Mon Sep 17 00:00:00 2001 From: hsonne Date: Mon, 14 Feb 2022 19:18:11 +0100 Subject: [PATCH 3/9] Add read_csv(), write_csv() accepting "lng" arg --- R/utils.R | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/R/utils.R b/R/utils.R index 9be5519..dc025aa 100644 --- a/R/utils.R +++ b/R/utils.R @@ -75,8 +75,33 @@ print_logical_matrix <- function( } } +# read_csv --------------------------------------------------------------------- +read_csv <- function(file, lng = "de") +{ + kwb.utils::callWith( + utils::read.table, + file = file, + kwb.utils::argsCsv(lng), + header = TRUE, + stringsAsFactors = FALSE + ) +} + # stop_formatted --------------------------------------------------------------- stop_formatted <- function(fmt, ...) { stop(sprintf(fmt, ...), call. = FALSE) } + +# write_csv -------------------------------------------------------------------- +write_csv <- function(x, file, lng = "de") +{ + kwb.utils::callWith( + FUN = utils::write.table, + kwb.utils::argsCsv(lng), + x = x, + file = file, + row.names = FALSE, + na = "" + ) +} From 1886cd7f14f7b878a092a75f7554920527dc52d2 Mon Sep 17 00:00:00 2001 From: hsonne Date: Mon, 14 Feb 2022 19:20:34 +0100 Subject: [PATCH 4/9] Fix :bug: in print_logical_matrix() --- R/utils.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/utils.R b/R/utils.R index dc025aa..f04c5cf 100644 --- a/R/utils.R +++ b/R/utils.R @@ -51,7 +51,7 @@ print_logical_matrix <- function( stopifnot(is.matrix(x)) if (! is.logical(x)) { - x <- as.logical(x) + mode(x) <- "logical" } if (invert) { From 1fc56e5a661278825cab24bf6709ea6ee4440bce Mon Sep 17 00:00:00 2001 From: hsonne Date: Mon, 14 Feb 2022 19:21:49 +0100 Subject: [PATCH 5/9] Stop early if no entry is found for "table_id" --- R/metadata.R | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/R/metadata.R b/R/metadata.R index 9f367a9..4971660 100644 --- a/R/metadata.R +++ b/R/metadata.R @@ -105,7 +105,7 @@ create_column_metadata <- function( column_infos <- lapply(names(tables), function(table_id) { - # table_id <- names(tables)[1] + #table_id <- names(tables)[1L] debug_formatted( dbg, "Creating column metadata for table '%s'... ", table_id @@ -115,6 +115,12 @@ create_column_metadata <- function( selected <- get_col(table_info, "table_id") == table_id + if (! any(selected)) { + stop_formatted( + "No entry for table_id '%s' found in table_info!", table_id + ) + } + n_headers <- get_col(table_info, "n_headers")[selected] col_types <- get_col(table_info, "col_types")[selected] From dce97a0f14717798e5cc4308a2127ae62157df32 Mon Sep 17 00:00:00 2001 From: hsonne Date: Mon, 14 Feb 2022 19:22:46 +0100 Subject: [PATCH 6/9] Accept "lng" arg in metadata export/import --- R/metadata.R | 42 ++++++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/R/metadata.R b/R/metadata.R index 4971660..14b1bcb 100644 --- a/R/metadata.R +++ b/R/metadata.R @@ -43,15 +43,24 @@ compact_column_info <- function(column_info) #' Export Table Metadata to CSV file #' #' @param table_info data frame containing metadata about the tables. +#' @param col_types if \code{TRUE} (default), a column with the guessed column +#' types is included in the created file. +#' @param lng one of "de" (German) or "en" (English) having an effect on column +#' and decimal separator characters. #' @param dbg logical. If \code{TRUE} (default), debug messages are shown. #' #' @keywords internal #' -export_table_metadata <- function(table_info, dbg = TRUE) { +export_table_metadata <- function(table_info, col_types = TRUE, lng = "de", dbg = TRUE) +{ stopifnot(is.data.frame(table_info)) kwb.utils::checkForMissingColumns(table_info, c("table_id", "table_name")) + if (! isTRUE(col_types)) { + table_info <- kwb.utils::removeColumns(table_info, "col_types") + } + file_xlsx <- kwb.utils::getAttribute(table_info, "file") file_csv <- paste0(kwb.utils::removeExtension(file_xlsx), "_META_tmp.csv") @@ -60,29 +69,29 @@ export_table_metadata <- function(table_info, dbg = TRUE) { debug_file(dbg, file_csv) - utils::write.csv(table_info, file = file_csv, row.names = FALSE, na = "") + write_csv(table_info, file_csv, lng) debug_ok(dbg) } # import_table_metadata -------------------------------------------------------- -import_table_metadata <- function(file, dbg = TRUE) { +import_table_metadata <- function(file, lng = "de", dbg = TRUE) +{ file_csv <- paste0(kwb.utils::removeExtension(file), "_META.csv") - if (file.exists(file_csv)) { - debug_formatted( - dbg, "Reading table metadata from\n '%s' ... ", basename(file_csv) - ) - - table_info <- utils::read.csv(file_csv, stringsAsFactors = FALSE) - - debug_ok(dbg) - } else { + if (! file.exists(file_csv)) { debug_formatted(dbg, "No metadata file found for this Excel file.\n") - - table_info <- NULL + return(NULL) } - + + debug_formatted( + dbg, "Reading table metadata from\n '%s' ... ", basename(file_csv) + ) + + table_info <- read_csv(file_csv, lng = lng) + + debug_ok(dbg) + table_info } @@ -91,7 +100,8 @@ create_column_metadata <- function( tables, table_info = attr(tables, "table_info"), dbg = TRUE ) { - # kwb.utils::assignArgumentDefaults("create_column_metadata") + #kwb.utils::assignPackageObjects("kwb.readxl") + #kwb.utils::assignArgumentDefaults("create_column_metadata") get_col <- kwb.utils::selectColumns From ea2b442b8f9572e03dfa8e3b02ef7117e506f1bc Mon Sep 17 00:00:00 2001 From: hsonne Date: Mon, 14 Feb 2022 19:23:27 +0100 Subject: [PATCH 7/9] Make test script run, apply to abluft2 labdata --- R/tests.R | 108 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 67 insertions(+), 41 deletions(-) diff --git a/R/tests.R b/R/tests.R index 7471573..9d4d6c8 100644 --- a/R/tests.R +++ b/R/tests.R @@ -1,3 +1,6 @@ +# Provide paths to Excel files +files <- dir("~/rawdata/abluft2/labdata/xlsx", "xlsx$", full.names = TRUE) + # M A I N ---------------------------------------------------------------------- if (FALSE) { @@ -5,29 +8,31 @@ if (FALSE) # 2. Source this script file_database <- kwb.file::to_file_database(files) - file_database$files - file_database$folders - + # Get all tables from one file - tables <- get_text_tables_from_xlsx(file = files[1]) + tables <- kwb.readxl:::get_text_tables_from_xlsx( + file = files[1L], + ids_as_names = TRUE # to name elements according to table_ids (below)! + ) + + # Get table metadata (just read an attribute) + table_info <- kwb.readxl:::get_table_info(tables) - # Get table metadata - table_info <- get_table_info(tables) - # Create column metadata from the table headers - column_info <- create_column_metadata(tables) + column_info <- kwb.readxl:::create_column_metadata(tables) # Add column "skip": If the user puts an "x" into this column, the # corresponding table will not be imported. table_info$skip <- "" + # Set "file" attribute in table_info + attr(table_info, "file") <- kwb.utils::getAttribute(tables, "file") + # Write table metadata to "_META.csv" - export_table_metadata( - structure(table_info, file = kwb.utils::getAttribute(tables, "file")) - ) + kwb.readxl:::export_table_metadata(table_info, col_types = FALSE) # import_table_metadata returns NULL if no metadata file exists - import_table_metadata(files[5]) + kwb.readxl:::import_table_metadata(files[5]) # Select all file indices indices <- seq_along(files) @@ -42,7 +47,7 @@ if (FALSE) # Get all tables from all files system.time(all_tables <- lapply(indices, function(index) { cat("File index:", index) - get_text_tables_from_xlsx(files[index]) + kwb.readxl:::get_text_tables_from_xlsx(files[index], ids_as_names = TRUE) })) # user system elapsed @@ -51,19 +56,20 @@ if (FALSE) names(all_tables) <- file_database$files$file_id[indices] # Create column metadata for all tables - column_info_list <- lapply(all_tables, create_column_metadata) + column_info_list <- lapply(all_tables, kwb.readxl:::create_column_metadata) - column_info <- rbindAll( + column_info <- kwb.utils::rbindAll( column_info_list, - nameColumn = "file_id", namesAsFactor = FALSE + nameColumn = "file_id", + namesAsFactor = FALSE ) - x <- compact_column_info(column_info) + x <- kwb.readxl:::compact_column_info(column_info) nrow(x) # 6141 - column_info <- suggest_column_name(column_info) + column_info <- kwb.readxl:::suggest_column_name(column_info) column_info <- merge(column_info, file_database$files) @@ -73,15 +79,18 @@ if (FALSE) file_metadata <- file.path(base_dir, "METADATA_columns_tmp.csv") - write.csv(column_info, file_metadata, row.names = FALSE) + kwb.readxl:::write_csv(column_info, file_metadata) # TODO: Rename METADATA_columns_tmp.csv to METADATA_columns.csv, let the user # modify the file and read back into column_info # - #column_info <- read_column_info(safePath(base_dir, "METADATA_columns.csv")) - + #column_info <- kwb.readxl:::read_column_info(safePath(base_dir, "METADATA_columns.csv")) + column_info <- kwb.readxl:::read_csv( + kwb.utils::safePath(base_dir, "METADATA_columns.csv") + ) + # Use column info to convert the text tables into data frames - all_data <- text_matrices_to_data_frames(all_tables, column_info) + all_data <- kwb.readxl:::text_matrices_to_data_frames(all_tables, column_info) lapply(all_data, function(all_tables) lapply(all_tables, utils::head)) @@ -93,17 +102,21 @@ if (FALSE) # Write a summary of the read structure to the log file utils::capture.output(file = logfile_summary, { - for (tables in all_tables) print_table_summary(tables) + for (tables in all_tables) { + kwb.readxl:::print_table_summary(tables) + } }) + #kwb.utils::hsOpenWindowsExplorer(dirname(logfile_summary)) + # Let's have a look at the tables in one Excel file only - tables <- all_tables[[1]] + tables <- all_tables[[1L]] # Get a description of the sheets in that file - get_sheet_info(tables) + kwb.readxl:::get_sheet_info(tables) # Get a description of tables in that file - get_table_info(tables) + kwb.readxl:::get_table_info(tables) # Get the name of the file that was read kwb.utils::getAttribute(tables, "file") @@ -115,31 +128,38 @@ if (FALSE) # Try to guess the header rows for each table... n_headers <- sapply(names(tables), function(table_id) { - guess_number_of_headers_from_text_matrix( + kwb.readxl:::guess_number_of_headers_from_text_matrix( tables[[table_id]], table_id = table_id ) }) - print_logical_matrix(guess_header_matrix(x = tables$table_01_01, n_max)) - print_logical_matrix(guess_header_matrix(x = tables$table_02_01, n_max)) + n_max = 10L + + print_guess <- function(x) { + kwb.readxl:::print_logical_matrix(x = guess_header_matrix(x, n_max)) + } + + print_guess(x = tables$table_01_01) + print_guess(x = tables$table_02_01) print_header_guesses(tables, n_max, file = logfile_headers) - lapply(all_tables[[3]], guess_header_matrix, 4) + lapply(all_tables[[2L]], guess_header_matrix, 4L) utils::head(x <- tables$table_02_01) is_empty <- (is.na(x) | x == "") - print_logical_matrix(utils::head(is_empty)) - print_logical_matrix(utils::head(is_empty), invert = TRUE) + kwb.readxl:::print_logical_matrix(utils::head(is_empty)) + kwb.readxl:::print_logical_matrix(utils::head(is_empty), invert = TRUE) } # Text Matrices to data frames ------------------------------------------------- -if (FALSE) { +if (FALSE) +{ # Convert text matrices of known format - tables <- get_text_tables_from_xlsx(file = files[1]) + tables <- kwb.readxl:::get_text_tables_from_xlsx(file = files[1L]) selected <- grepl("^table_02_", names(tables)) @@ -153,29 +173,35 @@ if (FALSE) { # print_header_guesses --------------------------------------------------------- print_header_guesses <- function( - text_matrices, n_max = 5, file = NULL, dbg = TRUE) { - if (!is.null(file)) { - debug_formatted(dbg, "Writing output to '%s'... ", file) + text_matrices, n_max = 5L, file = NULL, dbg = TRUE +) +{ + if (! is.null(file)) { + + kwb.readxl:::debug_formatted(dbg, "Writing output to '%s'... ", file) utils::capture.output(file = file, print_header_guesses(text_matrices, n_max)) - debug_ok(dbg) + kwb.readxl:::debug_ok(dbg) + } else { # matrix_name <- "table_03_01" for (matrix_name in names(text_matrices)) { + header <- guess_header_matrix(x = text_matrices[[matrix_name]], n_max) - debug_formatted(dbg, "\n%s:\n", matrix_name) + kwb.readxl:::debug_formatted(dbg, "\n%s:\n", matrix_name) - print_logical_matrix(header) + kwb.readxl:::print_logical_matrix(header) } } } # guess_header_matrix ---------------------------------------------------------- -guess_header_matrix <- function(x, n_max = 10) { +guess_header_matrix <- function(x, n_max = 10L) +{ stopifnot(is.character(x)) kwb.utils::stopIfNotMatrix(x) From b491c22065bed3d92221807c0a13b1ba705dd24d Mon Sep 17 00:00:00 2001 From: hsonne Date: Tue, 15 Feb 2022 10:22:14 +0100 Subject: [PATCH 8/9] Add argument "na.strings" to read_csv() --- R/utils.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/utils.R b/R/utils.R index f04c5cf..165333e 100644 --- a/R/utils.R +++ b/R/utils.R @@ -76,13 +76,14 @@ print_logical_matrix <- function( } # read_csv --------------------------------------------------------------------- -read_csv <- function(file, lng = "de") +read_csv <- function(file, lng = "de", na.strings = "") { kwb.utils::callWith( utils::read.table, file = file, kwb.utils::argsCsv(lng), header = TRUE, + na.strings = na.strings, stringsAsFactors = FALSE ) } From d4ff06cd20a63c9f50d2c3878a74f55a3ceff6b9 Mon Sep 17 00:00:00 2001 From: hsonne Date: Mon, 6 Jun 2022 14:51:09 +0200 Subject: [PATCH 9/9] Replace 1:i with seq_len(i) found with extdata/cleanCodeBasics.R in kwb.code! --- R/tests.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/tests.R b/R/tests.R index 9d4d6c8..66f7025 100644 --- a/R/tests.R +++ b/R/tests.R @@ -210,7 +210,7 @@ guess_header_matrix <- function(x, n_max = 10L) do.call(cbind, lapply(x_head, function(column_values) { sapply(seq_along(column_values), function(i) { - as.integer(!(column_values[i] %in% column_values[-(1:i)])) + as.integer(!(column_values[i] %in% column_values[-seq_len(i)])) }) })) }