# Uploading data

### **Create a new dataset**

```r
library(redivis)

# A dataset can also be created under an organization:
# dataset <- redivis$organization("Demo organization")$dataset("some dataset")
dataset <- redivis$user("your-username")$dataset("some dataset")

# public_access_level must be one of:
# 'none', 'overview', 'metadata', 'sample', 'data'
dataset$create(public_access_level="overview")
```

### **Create a table and upload data**

```r
library(redivis)

dataset <- redivis$user("user_name")$dataset("dataset_name", version="next")

# Create a table on the dataset. Datasets may have multiple tables
table <- dataset$table("Table name")$create(description="Some description")

# Upload a file to the table.
# You can create multiple uploads per table, in which case they'll be appended together.
upload <- table$upload()$create(
    "./data.csv",           # Path to file, data.frame, raw vector, etc
    type="delimited",       # Inferred from file extension if not provided
    remove_on_fail=TRUE,    # Remove the upload if a failure occurs
    wait_for_finish=TRUE,   # Wait for the upload to finish processing
    raise_on_fail=TRUE      # Raise an error on failure
)
```

### **Upload non-tabular (unstructured) files**

```r
library(redivis)
dataset <- redivis$user("user_name")$dataset("dataset_name", version="next")

# Non-tabular files must be uploaded to file index tables
table <- dataset$table("my_files")$create(is_file_index=TRUE)

# Upload all contents of a directory
table$add_files(directory="/path/to/directory/")

# Upload specific files
table$add_files(files=list(
    list(path="/path/to/file.png"), # file name will be "file.png"
    list(path="/path/to/other/file.png", name="other_file.png"), # file name will be "other_file.png"
    list(data="Hello world", name="hello_world.txt"), # data can be a string or raw vector
    list(data=url("http://example.com"), name="example_com.html") # data can be a connection
))
```

### **Upload data from an external source**

```r
# Assuming we get a reference to the table the same as above...

upload <- table$upload("data.csv")

upload$create(
    transfer_specification=list(
        sourceType="gcs", # one of gcs, s3, bigQuery, url, redivis
        sourcePath="my-bucket/path-to-my-file.csv",
        # sourcePath="https://example.com/data-file", (for sourceType == "url")
        # sourcePath="project_name.dataset_name.table_name", (for sourceType == "bigQuery")
        # sourcePath="owner_name.dataset_or_workflow_name.table_name", (for sourceType == "redivis")
        identity="my_email@example.com" # The email associated with the data source
    )
)
```

### Stream data to an upload

```r
library(redivis)

dataset <- redivis$user("user_name")$dataset("dataset_name", version="next")
table <- dataset$table("table_name")

# Providing a schema with the initial request is optional (but recommended).
# If not set, schema will be inferred based on the first batch of rows.
schema <- list(
  list(name = "var1", type = "string"),
  list(name = "var2", type = "integer"),
  list(name = "var3", type = "dateTime")
)

# Construct a data.frame to send (or alternatively, a stringified JSON array of objects)
rows <- data.frame(
  var1 = c("hello", "world"),
  var2 = c(1, 2),
  var3 = c(NA, "2020-01-01T00:00:00.123")
)

upload <- table$upload(name="my_stream")

# Create, or get a reference to an existing upload named "my_stream"
upload$create(type="stream", schema=schema, if_not_exists=TRUE)

insert_response <- upload$insert_rows(rows)

# See REST API / uploads / insertRows
print(insert_response)
```

### **Release a new version**

```r
library(redivis)

# Reference the unreleased ("next") version of the dataset
dataset <- redivis$user("username")$dataset("some dataset", version="next")

# Release the "next" version of the dataset
dataset$release()
```

### **Create a subsequent version on an existing dataset**

```r
library(redivis)

dataset <- redivis$user("your-username")$dataset("some dataset")

# dataset$create_next_version will throw an error if a "next" version already exists,
# unless the ignore_if_exists argument is provided
dataset <- dataset$create_next_version(ignore_if_exists=TRUE)
table <- dataset$table("table name")

# By default, all new data is appended to the previous version of a table.
# If you'd like to replace the previous data, update the upload_merge_strategy.
table$update(upload_merge_strategy="replace")

upload <- table$upload("data.csv")$create(
    "./data.csv",           # Path to file, data.frame, raw vector, etc
    # All additional params are optional; default values are shown here
    type="delimited",       # One of stream, delimited, csv, ndjson, avro, parquet, orc, xls, xlsx, dta, sas7bdat, sav
    skip_bad_records=FALSE,
    has_header_row=TRUE,    # Only relevant for csv, xls(x)
    remove_on_fail=TRUE,    # Remove the upload if a failure occurs
    wait_for_finish=TRUE,   # Wait for the upload to finish processing
    raise_on_fail=TRUE,     # Raise an error on failure

    # The following are only relevant for delimited files:
    allow_quoted_newlines=FALSE, # Allow newlines within cells. Setting to TRUE will substantially reduce ingest performance.
    quote_character='"',         # The character used to escape delimiters within cells. Generally a double quote in compliant CSVs.
    delimiter=NULL               # Explicitly set the delimiter; otherwise it will be automatically inferred.
)

# When all uploads have finished, release the next version
dataset$release()
```
