Skip to content

Commit

Permalink
Implement to_csv
Browse files Browse the repository at this point in the history
  • Loading branch information
bonzani authored and Oceania2018 committed Nov 7, 2020
1 parent 5301a7a commit 4fa2fb8
Show file tree
Hide file tree
Showing 5 changed files with 276 additions and 4 deletions.
34 changes: 33 additions & 1 deletion src/Pandas.NET/Extensions/PandasMethods.Excel.cs
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
using System;
using System.Collections.Generic;
using System.Text;
using PandasNet.Impl;

namespace PandasNet
{
public static class PandasMethods
{
/// <summary>
///
/// Read a comma-separated values (csv) file into DataFrame.
/// </summary>
/// <param name="pd"></param>
/// <param name="filepath_or_buffer"></param>
Expand All @@ -18,5 +19,36 @@ public static IDataFrame read_csv(this Pandas pd, string filepath, string sep =
{
throw new NotImplementedException();
}

/// <summary>
/// Write object to a comma-separated values (csv) file. The file is
/// encoded as UTF-8 without a byte order mark.
/// </summary>
/// <param name="df">Data frame to write.</param>
/// <param name="filepath">File path.</param>
/// <param name="sep">Field delimiter for the output file.</param>
/// <param name="na_rep">Missing data representation.</param>
/// <param name="float_format">Format string for floating point numbers.</param>
/// <param name="columns">Columns to write; <c>null</c> writes every column.</param>
/// <param name="header">Write out the column names.</param>
/// <param name="quoting">
/// Numeric value of a <see cref="CsvQuoteStyle"/> member (0 = QUOTE_MINIMAL,
/// 1 = QUOTE_ALL, 2 = QUOTE_NONNUMERIC, 3 = QUOTE_NONE).
/// Defaults to QUOTE_MINIMAL. If you have set a float_format then
/// floats are converted to strings and thus QUOTE_NONNUMERIC will
/// treat them as non-numeric.
/// </param>
/// <param name="quotechar">Character used to quote fields.</param>
/// <param name="line_terminator">
/// The newline character or character sequence to use in the output
/// file. When null or empty, <see cref="Environment.NewLine"/> is used,
/// which depends on the OS in which this method is called
/// ("\n" on Linux, "\r\n" on Windows).
/// </param>
/// <exception cref="ArgumentNullException">
/// <paramref name="df"/> or <paramref name="filepath"/> is null.
/// </exception>
/// <exception cref="ArgumentException">
/// <paramref name="quoting"/> is not a defined <see cref="CsvQuoteStyle"/> value.
/// </exception>
public static void to_csv(this IDataFrame df, string filepath, char sep = ',',
    string na_rep = "", string float_format = null, IEnumerable<string> columns = null,
    bool header = true, int quoting = (int) CsvQuoteStyle.QUOTE_MINIMAL,
    char quotechar = '"', string line_terminator = null)
{
    // Reject bad arguments before the writer creates (and truncates) the
    // target file; otherwise an invalid call would still clobber it.
    if (df == null)
    {
        throw new ArgumentNullException(nameof(df));
    }
    if (filepath == null)
    {
        throw new ArgumentNullException(nameof(filepath));
    }
    if (!Enum.IsDefined(typeof(CsvQuoteStyle), quoting))
    {
        throw new ArgumentException("Invalid value", nameof(quoting));
    }

    var lineEnding = string.IsNullOrEmpty(line_terminator)
        ? Environment.NewLine
        : line_terminator;

    var writer = new CsvWriter(sep, na_rep, float_format, header,
        (CsvQuoteStyle) quoting, quotechar, lineEnding,
        new UTF8Encoding(false));
    writer.Write(filepath, df, columns);
}
}
}
158 changes: 158 additions & 0 deletions src/Pandas.NET/Impl/CsvWriter.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using NumSharp;

namespace PandasNet.Impl
{
/// <summary>
/// Writes the values of an <see cref="IDataFrame"/> to a delimited text
/// file, one line per row, optionally preceded by a header line holding
/// the column labels.
/// </summary>
internal class CsvWriter
{
    private readonly byte[] delimiterBytes;
    private readonly string noValueText;
    private readonly string floatFormat;
    private readonly bool header;
    private readonly CsvQuoteStyle quotingStyle;
    private readonly string quoteText;
    private readonly string escapedQuoteText;
    private readonly char[] specialChars;
    private readonly Encoding encoding;
    private readonly byte[] lfBytes;

    /// <summary>
    /// Creates a writer with a fixed dialect.
    /// </summary>
    /// <param name="sep">Field delimiter.</param>
    /// <param name="na_rep">Text written for empty values; null is treated as "".</param>
    /// <param name="floatFormat">
    /// Format string applied to float32/float64 values, or null to use
    /// the value's own ToString().
    /// </param>
    /// <param name="header">Whether to write the column labels first.</param>
    /// <param name="quotingStyle">Quoting policy for data fields.</param>
    /// <param name="quotechar">Character placed around quoted fields.</param>
    /// <param name="lineTerminator">Character sequence ending each line.</param>
    /// <param name="encoding">Encoding used for every byte written.</param>
    /// <exception cref="ArgumentNullException"><paramref name="encoding"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="lineTerminator"/> is null or empty.</exception>
    internal CsvWriter(char sep, string na_rep, string floatFormat,
        bool header, CsvQuoteStyle quotingStyle, char quotechar,
        string lineTerminator, Encoding encoding)
    {
        if (encoding == null)
        {
            throw new ArgumentNullException(nameof(encoding));
        }
        if (string.IsNullOrEmpty(lineTerminator))
        {
            throw new ArgumentException("Line terminator must not be empty.",
                nameof(lineTerminator));
        }

        this.encoding = encoding;
        this.floatFormat = floatFormat;
        this.header = header;
        this.quotingStyle = quotingStyle;
        noValueText = na_rep ?? string.Empty;

        // Encode the delimiter instead of casting it to a byte: a cast
        // silently truncates any character above U+00FF.
        delimiterBytes = encoding.GetBytes(new[] { sep });
        lfBytes = encoding.GetBytes(lineTerminator);

        quoteText = quotechar.ToString();
        // An embedded quote character is escaped by doubling it.
        escapedQuoteText = quoteText + quoteText;

        // Every character of the terminator is special, not just the
        // first two.
        specialChars = new[] { sep, quotechar }
            .Concat(lineTerminator)
            .Distinct()
            .ToArray();
    }

    /// <summary>
    /// Creates (or overwrites) <paramref name="filepath"/> and writes the
    /// selected columns of <paramref name="df"/> to it.
    /// </summary>
    /// <param name="filepath">Destination file.</param>
    /// <param name="df">Source data frame.</param>
    /// <param name="columns">
    /// Labels of the columns to write; null selects all columns of
    /// <paramref name="df"/>. When no columns are selected an empty file
    /// is produced.
    /// </param>
    internal void Write(string filepath, IDataFrame df,
        IEnumerable<string> columns)
    {
        var columnLabels = columns == null ?
            df.Columns.Values.Data<string>() : columns.ToArray();
        var columnCount = columnLabels.Length;
        int rowCount = df.Index.Size;
        var data = df[columnLabels].Values;
        using (var fs = File.Create(filepath))
        {
            if (columnCount == 0)
            {
                return;
            }
            if (header)
            {
                WriteHeader(fs, columnLabels);
            }
            for (var row = 0; row < rowCount; row++)
            {
                for (var col = 0; col < columnCount; col++)
                {
                    if (col > 0)
                    {
                        fs.Write(delimiterBytes, 0, delimiterBytes.Length);
                    }
                    WriteField(data[row][col], fs);
                }
                fs.Write(lfBytes, 0, lfBytes.Length);
            }
        }
    }

    /// <summary>
    /// Formats a single cell and writes it, quoted if the quoting style
    /// requires it.
    /// </summary>
    /// <param name="fieldValue">Cell value; a size of 0 means "no value".</param>
    /// <param name="fs">Output stream.</param>
    private void WriteField(NDArray fieldValue, Stream fs)
    {
        var hasValue = fieldValue.size > 0;
        var isFormattedFloat = hasValue && floatFormat != null &&
            (fieldValue.dtype == np.float32 || fieldValue.dtype == np.float64);

        string text;
        if (!hasValue)
        {
            text = noValueText;
        }
        else if (isFormattedFloat)
        {
            // NOTE(review): this uses the current culture, as before. A
            // culture with ',' as decimal separator yields a delimiter
            // inside the number; the quoting decision below is made on
            // this final text so such a field is quoted.
            text = ((double) fieldValue).ToString(floatFormat);
        }
        else
        {
            text = fieldValue.ToString();
        }

        // A float that went through floatFormat counts as a string, and
        // so does a missing value.
        var isNumeric = hasValue && !isFormattedFloat &&
            IsNumericType(fieldValue.dtype);

        WriteText(fs, text, NeedsQuoting(text, isNumeric));
    }

    /// <summary>
    /// Decides whether a data field must be surrounded by quote characters.
    /// </summary>
    /// <param name="text">The exact text that will be written.</param>
    /// <param name="isNumeric">Whether the field is an unformatted number.</param>
    /// <exception cref="ArgumentException">The quoting style is not a known value.</exception>
    private bool NeedsQuoting(string text, bool isNumeric)
    {
        switch (quotingStyle)
        {
            case CsvQuoteStyle.QUOTE_MINIMAL:
                return ContainsSpecialChar(text);
            case CsvQuoteStyle.QUOTE_ALL:
                return true;
            case CsvQuoteStyle.QUOTE_NONNUMERIC:
                // Numbers stay bare unless their text would break the row.
                return !isNumeric || ContainsSpecialChar(text);
            case CsvQuoteStyle.QUOTE_NONE:
                return false;
            default:
                throw new ArgumentException("Invalid value", nameof(quotingStyle));
        }
    }

    /// <summary>
    /// Writes the column labels on one line to the stream.
    /// </summary>
    /// <param name="fs">Output stream.</param>
    /// <param name="columnLabels">Column names; must contain at least one entry.</param>
    private void WriteHeader(Stream fs, string[] columnLabels)
    {
        for (var i = 0; i < columnLabels.Length; i++)
        {
            if (i > 0)
            {
                fs.Write(delimiterBytes, 0, delimiterBytes.Length);
            }
            var label = columnLabels[i] ?? string.Empty;
            // Plain labels stay unquoted under every style, as before.
            // A label is quoted only when it contains the delimiter, the
            // quote character or a line-terminator character, which would
            // otherwise corrupt the header line (never for QUOTE_NONE).
            var quoted = quotingStyle != CsvQuoteStyle.QUOTE_NONE &&
                ContainsSpecialChar(label);
            WriteText(fs, label, quoted);
        }
        fs.Write(lfBytes, 0, lfBytes.Length);
    }

    /// <summary>
    /// Encodes <paramref name="text"/> and writes it; when
    /// <paramref name="quoted"/> is set the text is wrapped in quote
    /// characters and embedded quote characters are doubled.
    /// </summary>
    private void WriteText(Stream fs, string text, bool quoted)
    {
        var output = quoted
            ? quoteText + text.Replace(quoteText, escapedQuoteText) + quoteText
            : text;
        var bytes = encoding.GetBytes(output);
        fs.Write(bytes, 0, bytes.Length);
    }

    /// <summary>
    /// True when <paramref name="text"/> contains the delimiter, the quote
    /// character or any character of the line terminator.
    /// </summary>
    private bool ContainsSpecialChar(string text)
    {
        return text.IndexOfAny(specialChars) >= 0;
    }

    /// <summary>
    /// True when <paramref name="type"/> is one of the built-in integer,
    /// floating point or decimal types.
    /// </summary>
    private static bool IsNumericType(Type type)
    {
        return type == typeof(sbyte) || type == typeof(byte) ||
            type == typeof(short) || type == typeof(ushort) ||
            type == typeof(int) || type == typeof(uint) ||
            type == typeof(long) || type == typeof(ulong) ||
            type == typeof(float) || type == typeof(double) ||
            type == typeof(decimal);
    }
}

/// <summary>
/// Quoting policies for CSV output. The numeric values are significant:
/// <c>to_csv</c> accepts the policy as an <c>int</c> and casts it to this
/// enum, so they must not be renumbered.
/// </summary>
internal enum CsvQuoteStyle
{
/// <summary>
/// Instructs writer objects to only quote those fields which
/// contain special characters such as delimiter, quotechar or any
/// of the characters in lineterminator.
/// </summary>
QUOTE_MINIMAL = 0,
/// <summary>
/// Instructs writer objects to quote all fields.
/// </summary>
QUOTE_ALL = 1,
/// <summary>
/// <para>Instructs writer objects to quote all non-numeric
/// fields.</para>
/// <para>NOTE(review): the wording of these members follows Python's
/// csv module, which also defines reader behaviour for this value
/// (unquoted fields are converted to float). No reader is visible
/// alongside this enum, so that part is not known to apply.</para>
/// </summary>
QUOTE_NONNUMERIC = 2,
/// <summary>
/// <para>Instructs writer objects to never quote fields.</para>
/// <para>NOTE(review): Python's csv module additionally escapes the
/// delimiter with an escapechar, or raises an error when none is set.
/// The CsvWriter declared next to this enum takes no escapechar and
/// writes such fields unchanged, so output containing the delimiter
/// or a line break will not round-trip.</para>
/// </summary>
QUOTE_NONE = 3
}
}
8 changes: 7 additions & 1 deletion src/Pandas.NET/Impl/DataFrame.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,14 @@ namespace PandasNet.Impl
{
public partial class DataFrame<TIndex> : PandasObject, IDataFrame
{
/// <summary>
/// The index (row labels) of the DataFrame.
/// </summary>
public IDataIndex Index { get; internal set; }

/// <summary>
/// The column labels of the DataFrame.
/// </summary>
public IDataIndex Columns { get; internal set; }

/// <summary>
Expand Down Expand Up @@ -140,7 +146,7 @@ public IDataFrame this[params int[] columnIndexs]
get
{
var colLength = columnIndexs.Length;
NDArray array = new object[_rowSize, colLength];
NDArray array = new NDArray(Values.dtype, new Shape(_rowSize, colLength));
for (var rowIndex = 0; rowIndex < _rowSize; rowIndex++)
{
for (var col = 0; col < colLength; col++)
Expand Down
4 changes: 2 additions & 2 deletions src/Pandas.NET/Pandas.Net.csproj
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<Project Sdk="Microsoft.NET.Sdk">
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFramework>netstandard2.0</TargetFramework>
<TargetFramework>netstandard2.0</TargetFramework>
<RootNamespace>PandasNet</RootNamespace>
<Version>0.1.0</Version>
<GeneratePackageOnBuild>true</GeneratePackageOnBuild>
Expand Down
76 changes: 76 additions & 0 deletions test/Pandas.NET.Test/DataFrameCsvTest.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
using Xunit;
using NumSharp;
using System.Linq;
using System.IO;

namespace PandasNet.Test
{
/// <summary>
/// Round-trip checks for <c>to_csv</c>: each test writes a 20 x 5 frame to
/// its own file, reads it back line by line and removes the file again.
/// </summary>
public class DataFrameCsvTest
{
    private static readonly string[] ColumnNames =
    {
        "first", "second", "third", "fourth", "fifth"
    };

    /// <summary>Wraps <paramref name="array"/> in a frame labelled with <see cref="ColumnNames"/>.</summary>
    private static IDataFrame CreateFrame(NDArray array)
    {
        var pd = new Pandas();
        return pd.DataFrame(array, null, ColumnNames, typeof(object));
    }

    [Fact]
    public void WriteCsv_ToFile_Test()
    {
        var filepath = "write_test.csv";
        var array = np.arange(100).reshape(20, 5);
        IDataFrame df1 = CreateFrame(array);
        try
        {
            df1.to_csv(filepath);
            using (var fr = File.OpenText(filepath))
            {
                Assert.Equal(string.Join(',', ColumnNames), fr.ReadLine());
                for (var i = 0; i < array.shape[0]; i++)
                {
                    Assert.Equal(string.Join(',', array[i].Data<int>()), fr.ReadLine());
                }
                // Nothing may follow the last data row.
                Assert.Null(fr.ReadLine());
            }
        }
        finally
        {
            // Do not leave artefacts in the test working directory.
            File.Delete(filepath);
        }
    }

    [Fact]
    public void WriteCsvQuoted_ToFile_Test()
    {
        var filepath = "write_quoted_test.csv";
        var array = np.arange(100).reshape(20, 5);
        IDataFrame df1 = CreateFrame(array);
        try
        {
            df1.to_csv(filepath, quoting: 1);
            using (var fr = File.OpenText(filepath))
            {
                Assert.Equal(string.Join(',', ColumnNames), fr.ReadLine());
                for (var i = 0; i < array.shape[0]; i++)
                {
                    Assert.Equal('"' + string.Join("\",\"", array[i].Data<int>()) + '"', fr.ReadLine());
                }
                Assert.Null(fr.ReadLine());
            }
        }
        finally
        {
            File.Delete(filepath);
        }
    }

    [Fact]
    public void WriteCsvFormated_ToFile_Test()
    {
        // Own file name: this test previously reused the path of the
        // quoted test, so the two overwrote each other's output.
        var filepath = "write_formatted_test.csv";
        var array = np.arange(0, 50, 0.5).reshape(20, 5);
        var floatFormat = "E03";
        IDataFrame df1 = CreateFrame(array);
        try
        {
            df1.to_csv(filepath, float_format: floatFormat);
            using (var fr = File.OpenText(filepath))
            {
                Assert.Equal(string.Join(',', ColumnNames), fr.ReadLine());
                for (var i = 0; i < array.shape[0]; i++)
                {
                    var formattedData = array[i].Data<double>().Select(
                        x => x.ToString(floatFormat));
                    Assert.Equal(string.Join(",", formattedData), fr.ReadLine());
                }
                Assert.Null(fr.ReadLine());
            }
        }
        finally
        {
            File.Delete(filepath);
        }
    }
}
}

0 comments on commit 4fa2fb8

Please sign in to comment.