JuliaData · bkamins · Dec 28, 2022 · Dec 28, 2022 · Jan 5, 2023 · Jan 5, 2023
diff --git a/docs/src/lib/functions.md b/docs/src/lib/functions.md
@@ -85,6 +85,7 @@ insertcols!
 invpermute!
 mapcols
 mapcols!
+nest
 permute!
 prepend!
 push!
@@ -102,6 +103,7 @@ table_transformation
 transform
 transform!
 vcat
+unnest
 ```
 
 ## Reshaping data frames between tall and wide formats

diff --git a/src/DataFrames.jl b/src/DataFrames.jl
@@ -76,6 +76,7 @@ export AbstractDataFrame,
  mapcols,
  mapcols!,
  ncol,
+ nest,
  nonunique,
  nrow,
  order,
@@ -95,6 +96,7 @@ export AbstractDataFrame,
  transform,
  transform!,
  unique!,
+ unnest,
  unstack,
  valuecols,
  metadata,
@@ -166,6 +168,7 @@ include("abstractdataframe/show.jl")
 include("groupeddataframe/show.jl")
 include("dataframerow/show.jl")
 include("abstractdataframe/io.jl")
+include("abstractdataframe/nest.jl")
 
 include("other/tables.jl")
 include("other/names.jl")

diff --git a/src/abstractdataframe/abstractdataframe.jl b/src/abstractdataframe/abstractdataframe.jl
@@ -2502,136 +2502,6 @@ function Missings.allowmissing(df::AbstractDataFrame,
  return new_df
 end
 
-"""
- flatten(df::AbstractDataFrame, cols)
-
-When columns `cols` of data frame `df` have iterable elements that define
-`length` (for example a `Vector` of `Vector`s), return a `DataFrame` where each
-element of each `col` in `cols` is flattened, meaning the column corresponding
-to `col` becomes a longer vector where the original entries are concatenated.
-Elements of row `i` of `df` in columns other than `cols` will be repeated
-according to the length of `df[i, col]`. These lengths must therefore be the
-same for each `col` in `cols`, or else an error is raised. Note that these
-elements are not copied, and thus if they are mutable changing them in the
-returned `DataFrame` will affect `df`.
-
-`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR).
-
-$METADATA_FIXED
-
-# Examples
-
-```jldoctest
-julia> df1 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7, 8]])
-2×3 DataFrame
- Row │ a b c
- │ Int64 Array… Array…
-─────┼───────────────────────
- 1 │ 1 [1, 2] [5, 6]
- 2 │ 2 [3, 4] [7, 8]
-
-julia> flatten(df1, :b)
-4×3 DataFrame
- Row │ a b c
- │ Int64 Int64 Array…
-─────┼──────────────────────
- 1 │ 1 1 [5, 6]
- 2 │ 1 2 [5, 6]
- 3 │ 2 3 [7, 8]
- 4 │ 2 4 [7, 8]
-
-julia> flatten(df1, [:b, :c])
-4×3 DataFrame
- Row │ a b c
- │ Int64 Int64 Int64
-─────┼─────────────────────
- 1 │ 1 1 5
- 2 │ 1 2 6
- 3 │ 2 3 7
- 4 │ 2 4 8
-
-julia> df2 = DataFrame(a=[1, 2], b=[("p", "q"), ("r", "s")])
-2×2 DataFrame
- Row │ a b
- │ Int64 Tuple…
-─────┼───────────────────
- 1 │ 1 ("p", "q")
- 2 │ 2 ("r", "s")
-
-julia> flatten(df2, :b)
-4×2 DataFrame
- Row │ a b
- │ Int64 String
-─────┼───────────────
- 1 │ 1 p
- 2 │ 1 q
- 3 │ 2 r
- 4 │ 2 s
-
-julia> df3 = DataFrame(a=[1, 2], b=[[1, 2], [3, 4]], c=[[5, 6], [7]])
-2×3 DataFrame
- Row │ a b c
- │ Int64 Array… Array…
-─────┼───────────────────────
- 1 │ 1 [1, 2] [5, 6]
- 2 │ 2 [3, 4] [7]
-
-julia> flatten(df3, [:b, :c])
-ERROR: ArgumentError: Lengths of iterables stored in columns :b and :c are not the same in row 2
-```
-"""
-function flatten(df::AbstractDataFrame,
- cols::Union{ColumnIndex, MultiColumnIndex})
- _check_consistency(df)
-
- idxcols = index(df)[cols]
- if isempty(idxcols)
- cdf = copy(df)
- _drop_all_nonnote_metadata!(cdf)
- return cdf
- end
-
- col1 = first(idxcols)
- lengths = length.(df[!, col1])
- for col in idxcols
- v = df[!, col]
- if any(x -> length(x[1]) != x[2], zip(v, lengths))
- r = findfirst(x -> x != 0, length.(v) .- lengths)
- colnames = _names(df)
- throw(ArgumentError("Lengths of iterables stored in columns :$(colnames[col1]) " *
- "and :$(colnames[col]) are not the same in row $r"))
- end
- end
-
- new_df = similar(df[!, Not(cols)], sum(lengths))
- for name in _names(new_df)
- repeat_lengths!(new_df[!, name], df[!, name], lengths)
- end
- length(idxcols) > 1 && sort!(idxcols)
- for col in idxcols
- col_to_flatten = df[!, col]
- fast_path = eltype(col_to_flatten) isa AbstractVector &&
- !isempty(col_to_flatten)
- flattened_col = fast_path ?
- reduce(vcat, col_to_flatten) :
- collect(Iterators.flatten(col_to_flatten))
- insertcols!(new_df, col, _names(df)[col] => flattened_col)
- end
-
- _copy_all_note_metadata!(new_df, df)
- return new_df
-end
-
-function repeat_lengths!(longnew::AbstractVector, shortold::AbstractVector,
- lengths::AbstractVector{Int})
- counter = 1
- @inbounds for i in eachindex(shortold)
- l = lengths[i]
- longnew[counter:(counter + l - 1)] .= Ref(shortold[i])
- counter += l
- end
-end
-
 # Disallowed getindex and setindex! operations that are a common mistake
 
 Base.getindex(::AbstractDataFrame, ::Union{Symbol, Integer, AbstractString}) =