diff --git a/src/Pandas.NET/Extensions/PandasMethods.Excel.cs b/src/Pandas.NET/Extensions/PandasMethods.Excel.cs
index 1efb4b6..9f28849 100644
--- a/src/Pandas.NET/Extensions/PandasMethods.Excel.cs
+++ b/src/Pandas.NET/Extensions/PandasMethods.Excel.cs
@@ -1,13 +1,14 @@
 using System;
 using System.Collections.Generic;
 using System.Text;
+using PandasNet.Impl;
 
 namespace PandasNet
 {
     public static class PandasMethods
     {
         ///
-        ///
+        /// Read a comma-separated values (csv) file into DataFrame.
         ///
         ///
         ///
@@ -18,5 +19,40 @@ public static IDataFrame read_csv(this Pandas pd, string filepath, string sep =
         {
             throw new NotImplementedException();
         }
+
+        /// <summary>
+        /// Write object to a comma-separated values (csv) file.
+        /// </summary>
+        /// <param name="df">Data frame whose values are written.</param>
+        /// <param name="filepath">File path.</param>
+        /// <param name="sep">Field delimiter for the output file.</param>
+        /// <param name="na_rep">Missing data representation.</param>
+        /// <param name="float_format">Format string for floating point numbers.</param>
+        /// <param name="columns">Columns to write.</param>
+        /// <param name="header">Write out the column names.</param>
+        /// <param name="quoting">
+        /// Defaults to QUOTE_MINIMAL. If you have set a float_format then
+        /// floats are converted to strings and thus QUOTE_NONNUMERIC will
+        /// treat them as non-numeric.
+        /// </param>
+        /// <param name="quotechar">Character used to quote fields.</param>
+        /// <param name="line_terminator">
+        /// The newline character or character sequence to use in the output
+        /// file. Defaults to <see cref="Environment.NewLine"/>, which depends
+        /// on the OS in which this method is called ("\n" for Linux,
+        /// "\r\n" for Windows).
+        /// </param>
+        public static void to_csv(this IDataFrame df, string filepath, char sep = ',',
+            string na_rep = "", string float_format = null, IEnumerable<string> columns = null,
+            bool header = true, int quoting = (int) CsvQuoteStyle.QUOTE_MINIMAL,
+            char quotechar = '"', string line_terminator = null)
+        {
+            // A null or empty terminator selects the platform default.
+            var newline = string.IsNullOrEmpty(line_terminator)
+                ? Environment.NewLine : line_terminator;
+            var writer = new CsvWriter(sep, na_rep, float_format, header,
+                (CsvQuoteStyle) quoting, quotechar, newline, new UTF8Encoding(false));
+            writer.Write(filepath, df, columns);
+        }
     }
 }
diff --git a/src/Pandas.NET/Impl/CsvWriter.cs b/src/Pandas.NET/Impl/CsvWriter.cs
new file mode 100644
index 0000000..5f59e6d
--- /dev/null
+++ b/src/Pandas.NET/Impl/CsvWriter.cs
@@ -0,0 +1,259 @@
+using System;
+using System.Collections.Generic;
+using System.Globalization;
+using System.IO;
+using System.Linq;
+using System.Text;
+using NumSharp;
+
+namespace PandasNet.Impl
+{
+    /// <summary>
+    /// Writes the values of an <see cref="IDataFrame"/> to a delimited
+    /// text file, one line per row.
+    /// </summary>
+    internal class CsvWriter
+    {
+        private readonly byte[] delimiter;
+        private readonly string noValue;
+        private readonly string floatFormat;
+        private readonly bool header;
+        private readonly CsvQuoteStyle quotingStyle;
+        private readonly string quote;
+        private readonly string escapedQuote;
+        private readonly char[] specialChars;
+        private readonly Encoding encoding;
+        private readonly byte[] lfBytes;
+
+        /// <summary>
+        /// Creates a writer with the given formatting options.
+        /// </summary>
+        /// <param name="sep">Field delimiter.</param>
+        /// <param name="na_rep">Text written for empty values; null means "".</param>
+        /// <param name="floatFormat">Format string for float32/float64 values, or null.</param>
+        /// <param name="header">Whether the column labels are written as first line.</param>
+        /// <param name="quotingStyle">Rule that decides which fields are quoted.</param>
+        /// <param name="quotechar">Character used to quote fields.</param>
+        /// <param name="lineTerminator">Character sequence that ends every line.</param>
+        /// <param name="encoding">Encoding of the text written to the file.</param>
+        /// <exception cref="ArgumentNullException">encoding is null.</exception>
+        /// <exception cref="ArgumentException">
+        /// lineTerminator is null or empty, or quotingStyle is not a defined value.
+        /// </exception>
+        internal CsvWriter(char sep, string na_rep, string floatFormat,
+            bool header, CsvQuoteStyle quotingStyle, char quotechar,
+            string lineTerminator, Encoding encoding)
+        {
+            if (encoding == null) { throw new ArgumentNullException(nameof(encoding)); }
+            if (string.IsNullOrEmpty(lineTerminator))
+            {
+                throw new ArgumentException("Value cannot be null or empty.",
+                    nameof(lineTerminator));
+            }
+            // Reject unknown styles here, before Write has created any file.
+            if (!Enum.IsDefined(typeof(CsvQuoteStyle), quotingStyle))
+            {
+                throw new ArgumentException("Invalid value", nameof(quotingStyle));
+            }
+
+            // Encode the delimiter instead of casting it to a byte: the cast
+            // truncates every character above U+00FF and emits invalid UTF-8
+            // for U+0080..U+00FF.
+            delimiter = encoding.GetBytes(new[] { sep });
+            noValue = na_rep ?? string.Empty;
+            this.floatFormat = floatFormat;
+            this.header = header;
+            this.quotingStyle = quotingStyle;
+            quote = quotechar.ToString();
+            // An embedded quote character is escaped by doubling it.
+            escapedQuote = quote + quote;
+            // Every character of the terminator is special (not only the first
+            // two); CR and LF always are, because they break the line structure.
+            specialChars = new[] { sep, quotechar, '\r', '\n' }
+                .Concat(lineTerminator).Distinct().ToArray();
+            this.encoding = encoding;
+            lfBytes = encoding.GetBytes(lineTerminator);
+        }
+
+        /// <summary>
+        /// Writes the selected columns of <paramref name="df"/> to a new file,
+        /// overwriting an existing one.
+        /// </summary>
+        /// <param name="filepath">Path of the file to create.</param>
+        /// <param name="df">Data frame to write.</param>
+        /// <param name="columns">Labels of the columns to write; null selects all columns.</param>
+        /// <exception cref="ArgumentNullException">filepath or df is null.</exception>
+        internal void Write(string filepath, IDataFrame df,
+            IEnumerable<string> columns)
+        {
+            if (filepath == null) { throw new ArgumentNullException(nameof(filepath)); }
+            if (df == null) { throw new ArgumentNullException(nameof(df)); }
+
+            var columnLabels = columns == null ?
+                df.Columns.Values.Data<string>() : columns.ToArray();
+            var columnCount = columnLabels.Length;
+            int rowCount = df.Index.Size;
+            var data = df[columnLabels].Values;
+            using (var fs = File.Create(filepath))
+            {
+                // Without columns there is nothing to write; the file stays empty.
+                if (columnCount == 0) { return; }
+                if (header) { WriteHeader(fs, columnLabels); }
+                for (var i = 0; i < rowCount; i++)
+                {
+                    for (var j = 0; j < columnCount; j++)
+                    {
+                        if (j > 0) { fs.Write(delimiter, 0, delimiter.Length); }
+                        WriteField(data[i][j], fs);
+                    }
+                    fs.Write(lfBytes, 0, lfBytes.Length);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Writes one cell: the missing-value text when the cell is empty,
+        /// otherwise its value, formatted with the float format where it applies.
+        /// </summary>
+        /// <param name="fieldValue">Cell to write.</param>
+        /// <param name="fs">Output stream.</param>
+        private void WriteField(NDArray fieldValue, Stream fs)
+        {
+            var text = noValue;
+            var isNumeric = false;
+            if (fieldValue.size > 0)
+            {
+                var dtype = fieldValue.dtype;
+                if (floatFormat != null &&
+                    (dtype == np.float32 || dtype == np.float64))
+                {
+                    // A formatted float counts as text for QUOTE_NONNUMERIC.
+                    // The invariant culture keeps the decimal separator a '.'.
+                    text = ((double) fieldValue).ToString(floatFormat,
+                        CultureInfo.InvariantCulture);
+                }
+                else
+                {
+                    text = fieldValue.ToString();
+                    isNumeric = IsNumber(dtype);
+                }
+            }
+            WriteText(fs, text, isNumeric);
+        }
+
+        /// <summary>
+        /// Writes the columnLabels on one line to the stream, quoted by the
+        /// same rules as text fields.
+        /// </summary>
+        /// <param name="fs">Output stream.</param>
+        /// <param name="columnLabels">Column names.</param>
+        private void WriteHeader(Stream fs, string[] columnLabels)
+        {
+            for (var i = 0; i < columnLabels.Length; i++)
+            {
+                if (i > 0) { fs.Write(delimiter, 0, delimiter.Length); }
+                WriteText(fs, columnLabels[i] ?? string.Empty, false);
+            }
+            fs.Write(lfBytes, 0, lfBytes.Length);
+        }
+
+        /// <summary>
+        /// Encodes <paramref name="text"/> and writes it to the stream, wrapped
+        /// in quote characters when the quoting style asks for it.
+        /// </summary>
+        /// <param name="fs">Output stream.</param>
+        /// <param name="text">Text of the field.</param>
+        /// <param name="isNumeric">Whether the field holds a number.</param>
+        private void WriteText(Stream fs, string text, bool isNumeric)
+        {
+            if (NeedsQuoting(text, isNumeric))
+            {
+                text = quote + text.Replace(quote, escapedQuote) + quote;
+            }
+            var bytes = encoding.GetBytes(text);
+            fs.Write(bytes, 0, bytes.Length);
+        }
+
+        /// <summary>
+        /// Decides whether a field has to be quoted under the configured style.
+        /// </summary>
+        /// <param name="text">Text of the field as it will be written.</param>
+        /// <param name="isNumeric">Whether the field holds a number.</param>
+        /// <returns>true if the field must be wrapped in quote characters.</returns>
+        private bool NeedsQuoting(string text, bool isNumeric)
+        {
+            switch (quotingStyle)
+            {
+                case CsvQuoteStyle.QUOTE_MINIMAL:
+                    return text.IndexOfAny(specialChars) >= 0;
+                case CsvQuoteStyle.QUOTE_ALL:
+                    return true;
+                case CsvQuoteStyle.QUOTE_NONNUMERIC:
+                    return !isNumeric;
+                case CsvQuoteStyle.QUOTE_NONE:
+                    return false;
+                default:
+                    throw new ArgumentException("Invalid value", nameof(quotingStyle));
+            }
+        }
+
+        /// <summary>
+        /// Tells whether <paramref name="type"/> is a built-in numeric type.
+        /// </summary>
+        /// <param name="type">Element type of a cell.</param>
+        /// <returns>true for the integer, floating point and decimal types.</returns>
+        private static bool IsNumber(Type type)
+        {
+            switch (Type.GetTypeCode(type))
+            {
+                case TypeCode.SByte:
+                case TypeCode.Byte:
+                case TypeCode.Int16:
+                case TypeCode.UInt16:
+                case TypeCode.Int32:
+                case TypeCode.UInt32:
+                case TypeCode.Int64:
+                case TypeCode.UInt64:
+                case TypeCode.Single:
+                case TypeCode.Double:
+                case TypeCode.Decimal:
+                    return true;
+                default:
+                    return false;
+            }
+        }
+    }
+
+    /// <summary>
+    /// Quoting rules of the csv writer; the values mirror the QUOTE_*
+    /// constants of the Python csv module.
+    /// </summary>
+    internal enum CsvQuoteStyle
+    {
+        /// <summary>
+        /// Instructs writer objects to only quote those fields which
+        /// contain special characters such as delimiter, quotechar or any
+        /// of the characters in lineterminator.
+        /// </summary>
+        QUOTE_MINIMAL = 0,
+        /// <summary>
+        /// Instructs writer objects to quote all fields.
+        /// </summary>
+        QUOTE_ALL = 1,
+        /// <summary>
+        /// Instructs writer objects to quote all non-numeric
+        /// fields.
+        /// Instructs the reader to convert all non-quoted fields
+        /// to type float.
+        /// </summary>
+        QUOTE_NONNUMERIC = 2,
+        /// <summary>
+        /// Instructs writer objects to never quote fields. CsvWriter has
+        /// no escapechar support: characters that would require escaping
+        /// are written unchanged.
+        /// Instructs reader to perform no special processing of
+        /// quote characters.
+        /// </summary>
+        QUOTE_NONE = 3
+    }
+}
diff --git a/src/Pandas.NET/Impl/DataFrame.cs b/src/Pandas.NET/Impl/DataFrame.cs
index ba741b8..61231ae 100644
--- a/src/Pandas.NET/Impl/DataFrame.cs
+++ b/src/Pandas.NET/Impl/DataFrame.cs
@@ -10,8 +10,14 @@ namespace PandasNet.Impl
 {
     public partial class DataFrame : PandasObject, IDataFrame
     {
+        /// <summary>
+        /// The index (row labels) of the DataFrame.
+        /// </summary>
         public IDataIndex Index { get; internal set; }
 
+        /// <summary>
+        /// The column labels of the DataFrame.
+        /// </summary>
         public IDataIndex Columns { get; internal set; }
 
         ///
@@ -140,7 +146,7 @@ public IDataFrame this[params int[] columnIndexs]
             get
             {
                 var colLength = columnIndexs.Length;
-                NDArray array = new object[_rowSize, colLength];
+                NDArray array = new NDArray(Values.dtype, new Shape(_rowSize, colLength));
                 for (var rowIndex = 0; rowIndex < _rowSize; rowIndex++)
                 {
                     for (var col = 0; col < colLength; col++)
diff --git a/src/Pandas.NET/Pandas.Net.csproj b/src/Pandas.NET/Pandas.Net.csproj
index 48df23f..a762fa0 100644
--- a/src/Pandas.NET/Pandas.Net.csproj
+++ b/src/Pandas.NET/Pandas.Net.csproj
@@ -1,7 +1,7 @@
-
+
 
 
-    netstandard2.0
+    netstandard2.0
     PandasNet
     0.1.0
     true
diff --git a/test/Pandas.NET.Test/DataFrameCsvTest.cs b/test/Pandas.NET.Test/DataFrameCsvTest.cs
new file mode 100644
index 0000000..293752f
--- /dev/null
+++ b/test/Pandas.NET.Test/DataFrameCsvTest.cs
@@ -0,0 +1,79 @@
+using Xunit;
+using NumSharp;
+using System.Globalization;
+using System.Linq;
+using System.IO;
+
+namespace PandasNet.Test
+{
+    public class DataFrameCsvTest
+    {
+        public DataFrameCsvTest()
+        {}
+
+        [Fact]
+        public void WriteCsv_ToFile_Test()
+        {
+            var filepath = "write_test.csv";
+            var array = np.arange(100).reshape(20, 5);
+            var columnNames = new string[] { "first", "second", "third",
+                "fourth", "fifth" };
+            var pd = new Pandas();
+            IDataFrame df1 = pd.DataFrame(array, null, columnNames, typeof(object));
+            df1.to_csv(filepath);
+            using (var fr = File.OpenText(filepath))
+            {
+                Assert.Equal(string.Join(',', columnNames), fr.ReadLine());
+                for (var i = 0; i < array.shape[0]; i++)
+                {
+                    Assert.Equal(string.Join(',', array[i].Data<int>()), fr.ReadLine());
+                }
+            }
+        }
+
+        [Fact]
+        public void WriteCsvQuoted_ToFile_Test()
+        {
+            var filepath = "write_quoted_test.csv";
+            var array = np.arange(100).reshape(20, 5);
+            var columnNames = new string[] { "first", "second", "third",
+                "fourth", "fifth" };
+            var pd = new Pandas();
+            IDataFrame df1 = pd.DataFrame(array, null, columnNames, typeof(object));
+            df1.to_csv(filepath, quoting: 1);
+            using (var fr = File.OpenText(filepath))
+            {
+                // QUOTE_ALL (1) quotes the header labels as well as the values.
+                Assert.Equal('"' + string.Join("\",\"", columnNames) + '"', fr.ReadLine());
+                for (var i = 0; i < array.shape[0]; i++)
+                {
+                    Assert.Equal('"' + string.Join("\",\"", array[i].Data<int>()) + '"', fr.ReadLine());
+                }
+            }
+        }
+
+        [Fact]
+        public void WriteCsvFormated_ToFile_Test()
+        {
+            var filepath = "write_formatted_test.csv";
+            var array = np.arange(0, 50, 0.5).reshape(20, 5);
+            var columnNames = new string[] { "first", "second", "third",
+                "fourth", "fifth" };
+            var floatFormat = "E03";
+            var pd = new Pandas();
+            IDataFrame df1 = pd.DataFrame(array, null, columnNames, typeof(object));
+            df1.to_csv(filepath, float_format: floatFormat);
+            using (var fr = File.OpenText(filepath))
+            {
+                Assert.Equal(string.Join(',', columnNames), fr.ReadLine());
+                for (var i = 0; i < array.shape[0]; i++)
+                {
+                    // The writer formats floats with the invariant culture.
+                    var formattedData = array[i].Data<double>().Select(
+                        x => x.ToString(floatFormat, CultureInfo.InvariantCulture));
+                    Assert.Equal(string.Join(",", formattedData), fr.ReadLine());
+                }
+            }
+        }
+    }
+}