Implement to_csv

SciSharp · Nov 7, 2020 · 4fa2fb8 · 4fa2fb8
1 parent 5301a7a
commit 4fa2fb8
Show file tree

Hide file tree

Showing 5 changed files with 276 additions and 4 deletions.
diff --git a/src/Pandas.NET/Extensions/PandasMethods.Excel.cs b/src/Pandas.NET/Extensions/PandasMethods.Excel.cs
@@ -1,13 +1,14 @@
 using System;
 using System.Collections.Generic;
 using System.Text;
+using PandasNet.Impl;
 
 namespace PandasNet
 {
     public static class PandasMethods
     {
         /// <summary>
-        /// 
+        /// Read a comma-separated values (csv) file into DataFrame.
         /// </summary>
         /// <param name="pd"></param>
         /// <param name="filepath_or_buffer"></param>
@@ -18,5 +19,36 @@ public static IDataFrame read_csv(this Pandas pd, string filepath, string sep =
         {
             throw new NotImplementedException();
         }
+
+        /// <summary>
+        /// Write the values of a data frame to a comma-separated values (csv)
+        /// file. The file is encoded as UTF-8 without a byte order mark.
+        /// </summary>
+        /// <param name="df">Data frame whose values are written.</param>
+        /// <param name="filepath">File path.</param>
+        /// <param name="sep">Field delimiter for the output file.</param>
+        /// <param name="na_rep">
+        /// Missing data representation. <c>null</c> is treated as an empty
+        /// string.
+        /// </param>
+        /// <param name="float_format">Format string for floating point numbers.</param>
+        /// <param name="columns">Columns to write; <c>null</c> writes every column.</param>
+        /// <param name="header">Write out the column names.</param>
+        /// <param name="quoting">
+        /// Integer value of a CsvQuoteStyle member: 0 = QUOTE_MINIMAL
+        /// (default), 1 = QUOTE_ALL, 2 = QUOTE_NONNUMERIC, 3 = QUOTE_NONE.
+        /// If you have set a float_format then floats are converted to
+        /// strings and thus QUOTE_NONNUMERIC will treat them as non-numeric.
+        /// </param>
+        /// <param name="quotechar">Character used to quote fields.</param>
+        /// <param name="line_terminator">
+        /// The newline character or character sequence to use in the output
+        /// file. When <c>null</c> or empty, <see cref="Environment.NewLine"/>
+        /// is used, which depends on the OS on which this method is called
+        /// ("\n" on Linux, "\r\n" on Windows).
+        /// </param>
+        /// <exception cref="ArgumentNullException">
+        /// <paramref name="df"/> or <paramref name="filepath"/> is <c>null</c>.
+        /// </exception>
+        public static void to_csv(this IDataFrame df, string filepath, char sep = ',',
+            string na_rep = "", string float_format = null, IEnumerable<string> columns = null,
+            bool header = true, int quoting = (int) CsvQuoteStyle.QUOTE_MINIMAL,
+            char quotechar = '"', string line_terminator = null)
+        {
+            // Fail fast with a descriptive exception instead of a
+            // NullReferenceException raised from inside the writer.
+            if (df == null) { throw new ArgumentNullException(nameof(df)); }
+            if (filepath == null) { throw new ArgumentNullException(nameof(filepath)); }
+
+            var terminator = string.IsNullOrEmpty(line_terminator)
+                ? Environment.NewLine
+                : line_terminator;
+
+            // A null na_rep would otherwise reach Encoding.GetBytes(null).
+            var writer = new CsvWriter(sep, na_rep ?? string.Empty, float_format,
+                header, (CsvQuoteStyle) quoting, quotechar, terminator,
+                new UTF8Encoding(false));
+            writer.Write(filepath, df, columns);
+        }
     }
 }
diff --git a/src/Pandas.NET/Impl/CsvWriter.cs b/src/Pandas.NET/Impl/CsvWriter.cs
@@ -0,0 +1,158 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using NumSharp;
+
+namespace PandasNet.Impl
+{
+    /// <summary>
+    /// Writes the values of an <see cref="IDataFrame"/> to a delimited text
+    /// file: an optional header line followed by one line per row.
+    /// </summary>
+    internal class CsvWriter
+    {
+        private readonly byte[] delimiterBytes;
+        private readonly string noValue;
+        private readonly string floatFormat;
+        private readonly bool header;
+        private readonly CsvQuoteStyle quotingStyle;
+        private readonly string quoteText;
+        private readonly string escapedQuoteText;
+        private readonly byte[] quoteBytes;
+        private readonly char[] specialChars;
+        private readonly Encoding encoding;
+        private readonly byte[] lfBytes;
+
+        /// <summary>
+        /// Creates a writer with a fixed output dialect.
+        /// </summary>
+        /// <param name="sep">Field delimiter.</param>
+        /// <param name="na_rep">Text written for empty values; null means "".</param>
+        /// <param name="floatFormat">
+        /// Format string applied to float32/float64 values, or null to use
+        /// the value's own string representation.
+        /// </param>
+        /// <param name="header">Whether to write the column labels first.</param>
+        /// <param name="quotingStyle">Quoting mode.</param>
+        /// <param name="quotechar">Character placed around quoted fields.</param>
+        /// <param name="lineTerminator">
+        /// Sequence written after every line; null or empty falls back to
+        /// <see cref="Environment.NewLine"/>.
+        /// </param>
+        /// <param name="encoding">Encoding used for every byte written.</param>
+        /// <exception cref="ArgumentNullException">encoding is null.</exception>
+        /// <exception cref="ArgumentException">quotingStyle is not a defined value.</exception>
+        internal CsvWriter(char sep, string na_rep, string floatFormat,
+            bool header, CsvQuoteStyle quotingStyle, char quotechar,
+            string lineTerminator, Encoding encoding)
+        {
+            if (encoding == null) { throw new ArgumentNullException(nameof(encoding)); }
+            // Reject an unknown style up front, before any file is touched.
+            if (!Enum.IsDefined(typeof(CsvQuoteStyle), quotingStyle))
+            {
+                throw new ArgumentException("Invalid value", nameof(quotingStyle));
+            }
+            if (string.IsNullOrEmpty(lineTerminator)) { lineTerminator = Environment.NewLine; }
+
+            this.encoding = encoding;
+            this.floatFormat = floatFormat;
+            this.header = header;
+            this.quotingStyle = quotingStyle;
+            noValue = na_rep ?? string.Empty;
+
+            // Encode the delimiter and quote character instead of casting
+            // them to a single byte, which truncated non-ASCII characters.
+            delimiterBytes = encoding.GetBytes(sep.ToString());
+            quoteText = quotechar.ToString();
+            escapedQuoteText = quoteText + quoteText;
+            quoteBytes = encoding.GetBytes(quoteText);
+            lfBytes = encoding.GetBytes(lineTerminator);
+
+            // Characters that force quoting under QUOTE_MINIMAL: delimiter,
+            // quote character and every character of the line terminator.
+            specialChars = new[] { sep, quotechar }
+                .Concat(lineTerminator).Distinct().ToArray();
+        }
+
+        /// <summary>
+        /// Creates (or overwrites) <paramref name="filepath"/> and writes the
+        /// selected columns of <paramref name="df"/> to it. When no column
+        /// is selected the file is left empty.
+        /// </summary>
+        /// <param name="filepath">Destination file path.</param>
+        /// <param name="df">Data frame to serialize.</param>
+        /// <param name="columns">Column labels to write; null selects all columns.</param>
+        /// <exception cref="ArgumentNullException">df is null.</exception>
+        internal void Write(string filepath, IDataFrame df,
+            IEnumerable<string> columns)
+        {
+            if (df == null) { throw new ArgumentNullException(nameof(df)); }
+
+            var columnLabels = columns == null ?
+                df.Columns.Values.Data<string>() : columns.ToArray();
+            var columnCount = columnLabels.Length;
+            int rowCount = df.Index.Size;
+            var data = df[columnLabels].Values;
+            using (var fs = File.Create(filepath))
+            {
+                if (columnCount == 0) { return; }
+                if (header) { WriteHeader(fs, columnLabels); }
+                for (var row = 0; row < rowCount; row++)
+                {
+                    var rowValues = data[row];
+                    for (var col = 0; col < columnCount; col++)
+                    {
+                        if (col > 0) { WriteBytes(fs, delimiterBytes); }
+                        WriteField(rowValues[col], fs);
+                    }
+                    WriteBytes(fs, lfBytes);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Formats one cell and writes it, quoted if the quoting style
+        /// requires it. A cell of size 0 is written as the na_rep text.
+        /// </summary>
+        /// <param name="fieldValue">Cell value.</param>
+        /// <param name="fs">Output stream.</param>
+        private void WriteField(NDArray fieldValue, Stream fs)
+        {
+            var text = noValue;
+            var numeric = false;
+            if (fieldValue.size > 0)
+            {
+                var useFloatFormat = floatFormat != null &&
+                    (fieldValue.dtype == np.float32 || fieldValue.dtype == np.float64);
+                // Invariant culture keeps the output machine-readable: a
+                // culture-specific decimal comma would collide with the
+                // default delimiter.
+                text = useFloatFormat
+                    ? ((double) fieldValue).ToString(floatFormat,
+                        System.Globalization.CultureInfo.InvariantCulture)
+                    : fieldValue.ToString();
+                // Floats rendered through float_format count as strings, as
+                // documented on to_csv's quoting parameter.
+                numeric = !useFloatFormat && IsNumber(fieldValue);
+            }
+            WriteText(fs, text, NeedsQuoting(text, numeric));
+        }
+
+        /// <summary>
+        /// Decides whether a field has to be surrounded by quote characters.
+        /// </summary>
+        /// <param name="text">The text that will be written for the field.</param>
+        /// <param name="isNumeric">Whether the field holds a numeric value.</param>
+        /// <exception cref="ArgumentException">The quoting style is unknown.</exception>
+        private bool NeedsQuoting(string text, bool isNumeric)
+        {
+            switch (quotingStyle)
+            {
+                case CsvQuoteStyle.QUOTE_MINIMAL:
+                    return text.IndexOfAny(specialChars) >= 0;
+                case CsvQuoteStyle.QUOTE_ALL:
+                    return true;
+                case CsvQuoteStyle.QUOTE_NONNUMERIC:
+                    return !isNumeric;
+                case CsvQuoteStyle.QUOTE_NONE:
+                    return false;
+                default:
+                    throw new ArgumentException("Invalid value", nameof(quotingStyle));
+            }
+        }
+
+        /// <summary>
+        /// Writes the column labels on one line, followed by the line
+        /// terminator. A label is quoted only when it contains a special
+        /// character and the quoting style is not QUOTE_NONE; all other
+        /// labels are written verbatim.
+        /// </summary>
+        /// <param name="fs">Output stream</param>
+        /// <param name="columnLabels">Column names</param>
+        private void WriteHeader(Stream fs, string[] columnLabels)
+        {
+            for (var i = 0; i < columnLabels.Length; i++)
+            {
+                if (i > 0) { WriteBytes(fs, delimiterBytes); }
+                var label = columnLabels[i] ?? string.Empty;
+                var quoted = quotingStyle != CsvQuoteStyle.QUOTE_NONE &&
+                    label.IndexOfAny(specialChars) >= 0;
+                WriteText(fs, label, quoted);
+            }
+            WriteBytes(fs, lfBytes);
+        }
+
+        /// <summary>
+        /// Encodes and writes <paramref name="text"/>. When quoted, the text
+        /// is wrapped in quote characters and embedded quote characters are
+        /// doubled so the field can be parsed back unambiguously.
+        /// </summary>
+        private void WriteText(Stream fs, string text, bool quoted)
+        {
+            text = text ?? string.Empty;
+            if (quoted)
+            {
+                WriteBytes(fs, quoteBytes);
+                text = text.Replace(quoteText, escapedQuoteText);
+            }
+            WriteBytes(fs, encoding.GetBytes(text));
+            if (quoted) { WriteBytes(fs, quoteBytes); }
+        }
+
+        /// <summary>Writes a whole byte array to the stream.</summary>
+        private static void WriteBytes(Stream fs, byte[] bytes)
+        {
+            fs.Write(bytes, 0, bytes.Length);
+        }
+
+        /// <summary>
+        /// Returns true when <paramref name="value"/> is numeric. For an
+        /// <see cref="NDArray"/> the element type (dtype) is examined,
+        /// because the array object itself is never a boxed primitive.
+        /// </summary>
+        /// <remarks>
+        /// NOTE(review): when the dtype is not a primitive numeric type the
+        /// runtime type of the stored element is not inspected - confirm
+        /// whether such arrays can reach this writer.
+        /// </remarks>
+        private static bool IsNumber(object value)
+        {
+            // Pattern matching is used on purpose: it never goes through a
+            // user-defined equality operator.
+            Type type;
+            if (value is NDArray array) { type = array.dtype; }
+            else { type = value?.GetType(); }
+
+            switch (Type.GetTypeCode(type))
+            {
+                case TypeCode.SByte:
+                case TypeCode.Byte:
+                case TypeCode.Int16:
+                case TypeCode.UInt16:
+                case TypeCode.Int32:
+                case TypeCode.UInt32:
+                case TypeCode.Int64:
+                case TypeCode.UInt64:
+                case TypeCode.Single:
+                case TypeCode.Double:
+                case TypeCode.Decimal:
+                    return true;
+                default:
+                    return false;
+            }
+        }
+    }
+
+    /// <summary>
+    /// Quoting modes understood by <see cref="CsvWriter"/>. The numeric
+    /// values are what callers pass as the integer <c>quoting</c> argument
+    /// of <c>to_csv</c>, which casts that integer to this enum.
+    /// </summary>
+    /// <remarks>
+    /// NOTE(review): the member descriptions also mention reader and
+    /// escapechar behaviour; the writer in this file has no escapechar
+    /// parameter and no reader using this enum is visible here - confirm
+    /// which parts are actually implemented.
+    /// </remarks>
+    internal enum CsvQuoteStyle
+    {
+        /// <summary>
+        /// Instructs writer objects to only quote those fields which
+        /// contain special characters such as delimiter, quotechar or any
+        /// of the characters in lineterminator.
+        /// </summary>
+        QUOTE_MINIMAL = 0,
+        /// <summary>
+        /// Instructs writer objects to quote all fields.
+        /// </summary>
+        QUOTE_ALL = 1,
+        /// <summary>
+        /// <para>Instructs writer objects to quote all non-numeric
+        /// fields.</para>
+        /// <para>Instructs the reader to convert all non-quoted fields
+        /// to type float.</para>
+        /// </summary>
+        QUOTE_NONNUMERIC = 2,
+        /// <summary>
+        /// <para>Instructs writer objects to never quote fields. When the
+        /// current delimiter occurs in output data it is preceded by the
+        /// current escapechar character. If escapechar is not set, the
+        /// writer will raise Error if any characters that require escaping
+        /// are encountered.</para>
+        /// <para>Instructs reader to perform no special processing of
+        /// quote characters.</para>
+        /// </summary>
+        QUOTE_NONE = 3
+    }
+}
diff --git a/src/Pandas.NET/Impl/DataFrame.cs b/src/Pandas.NET/Impl/DataFrame.cs
@@ -10,8 +10,14 @@ namespace PandasNet.Impl
 {
     public partial class DataFrame<TIndex> : PandasObject, IDataFrame
     {
+        /// <summary>
+        /// The index (row labels) of the DataFrame.
+        /// </summary>
         public IDataIndex Index { get; internal set; }
 
+        /// <summary>
+        /// The column labels of the DataFrame.
+        /// </summary>
         public IDataIndex Columns { get; internal set; }
 
         /// <summary>
@@ -140,7 +146,7 @@ public IDataFrame this[params int[] columnIndexs]
             get
             {
                 var colLength = columnIndexs.Length;
-                NDArray array = new object[_rowSize, colLength];
+                NDArray array = new NDArray(Values.dtype, new Shape(_rowSize, colLength));
                 for (var rowIndex = 0; rowIndex < _rowSize; rowIndex++)
                 {
                     for (var col = 0; col < colLength; col++)

diff --git a/src/Pandas.NET/Pandas.Net.csproj b/src/Pandas.NET/Pandas.Net.csproj
@@ -1,7 +1,7 @@
-<Project Sdk="Microsoft.NET.Sdk">
+<Project Sdk="Microsoft.NET.Sdk">
 
   <PropertyGroup>
-        <TargetFramework>netstandard2.0</TargetFramework>
+    <TargetFramework>netstandard2.0</TargetFramework>
     <RootNamespace>PandasNet</RootNamespace>
     <Version>0.1.0</Version>
     <GeneratePackageOnBuild>true</GeneratePackageOnBuild>

diff --git a/test/Pandas.NET.Test/DataFrameCsvTest.cs b/test/Pandas.NET.Test/DataFrameCsvTest.cs
@@ -0,0 +1,76 @@
+using Xunit;
+using NumSharp;
+using System.Linq;
+using System.IO;
+
+namespace PandasNet.Test
+{
+	/// <summary>
+	/// Tests for <c>to_csv</c>: each test writes a data frame to its own
+	/// temporary file, reads the file back line by line and deletes it.
+	/// </summary>
+	public class DataFrameCsvTest
+	{
+		/// <summary>
+		/// Returns a unique path in the temp directory so tests never share
+		/// an output file.
+		/// </summary>
+		private static string NewTempFilePath()
+		{
+			return Path.Combine(Path.GetTempPath(), Path.GetRandomFileName() + ".csv");
+		}
+
+		[Fact]
+		public void WriteCsv_ToFile_Test()
+		{
+			var filepath = NewTempFilePath();
+			try
+			{
+				var array = np.arange(100).reshape(20, 5);
+				var columnNames = new string[] { "first", "second", "third",
+					"fourth", "fifth" };
+				var pd = new Pandas();
+				IDataFrame df1 = pd.DataFrame(array, null, columnNames, typeof(object));
+
+				df1.to_csv(filepath);
+
+				using (var fr = File.OpenText(filepath))
+				{
+					Assert.Equal(string.Join(',', columnNames), fr.ReadLine());
+					for (var i = 0; i < array.shape[0]; i++)
+					{
+						Assert.Equal(string.Join(',', array[i].Data<int>()), fr.ReadLine());
+					}
+					// Nothing may follow the last data row.
+					Assert.Null(fr.ReadLine());
+				}
+			}
+			finally
+			{
+				File.Delete(filepath);
+			}
+		}
+
+		[Fact]
+		public void WriteCsvQuoted_ToFile_Test()
+		{
+			var filepath = NewTempFilePath();
+			try
+			{
+				var array = np.arange(100).reshape(20, 5);
+				var columnNames = new string[] { "first", "second", "third",
+					"fourth", "fifth" };
+				var pd = new Pandas();
+				IDataFrame df1 = pd.DataFrame(array, null, columnNames, typeof(object));
+
+				// quoting: 1 is the integer value of QUOTE_ALL.
+				df1.to_csv(filepath, quoting: 1);
+
+				using (var fr = File.OpenText(filepath))
+				{
+					Assert.Equal(string.Join(',', columnNames), fr.ReadLine());
+					for (var i = 0; i < array.shape[0]; i++)
+					{
+						Assert.Equal('"' + string.Join("\",\"", array[i].Data<int>()) + '"', fr.ReadLine());
+					}
+					Assert.Null(fr.ReadLine());
+				}
+			}
+			finally
+			{
+				File.Delete(filepath);
+			}
+		}
+
+		[Fact]
+		public void WriteCsvFormated_ToFile_Test()
+		{
+			var filepath = NewTempFilePath();
+			try
+			{
+				var array = np.arange(0, 50, 0.5).reshape(20, 5);
+				var columnNames = new string[] { "first", "second", "third",
+					"fourth", "fifth" };
+				var floatFormat = "E03";
+				var pd = new Pandas();
+				IDataFrame df1 = pd.DataFrame(array, null, columnNames, typeof(object));
+
+				df1.to_csv(filepath, float_format: floatFormat);
+
+				using (var fr = File.OpenText(filepath))
+				{
+					Assert.Equal(string.Join(',', columnNames), fr.ReadLine());
+					for (var i = 0; i < array.shape[0]; i++)
+					{
+						// Invariant culture: the expected text must not depend
+						// on the decimal separator of the test machine.
+						var formattedData = array[i].Data<double>().Select(
+							x => x.ToString(floatFormat,
+								System.Globalization.CultureInfo.InvariantCulture));
+						Assert.Equal(string.Join(",", formattedData), fr.ReadLine());
+					}
+					Assert.Null(fr.ReadLine());
+				}
+			}
+			finally
+			{
+				File.Delete(filepath);
+			}
+		}
+	}
+}