diff --git a/UPDATING.md b/UPDATING.md index 3810cda0e9b60..2a3555da577a6 100644 --- a/UPDATING.md +++ b/UPDATING.md @@ -250,6 +250,8 @@ If you run a production system you should schedule downtime for this upgrade. The PRs bellow have more information around the breaking changes: +* [9825](https://github.com/apache/incubator-superset/pull/9825): Support for Excel sheet upload added. To enable support, install Superset with the optional dependency `excel` + * [4587](https://github.com/apache/incubator-superset/pull/4587) : a backward incompatible database migration that requires downtime. Once the db migration succeeds, the web server needs to be restarted with the diff --git a/requirements.txt b/requirements.txt index 1ed3e7b721445..73e7be6272714 100644 --- a/requirements.txt +++ b/requirements.txt @@ -100,4 +100,4 @@ yarl==1.4.2 # via aiohttp zipp==3.1.0 # via importlib-metadata # The following packages are considered to be unsafe in a requirements file: -# setuptools +# setuptools \ No newline at end of file diff --git a/setup.py b/setup.py index f2a4e1b3c31be..897c30595601d 100644 --- a/setup.py +++ b/setup.py @@ -123,6 +123,7 @@ def get_git_sha(): "dremio": ["sqlalchemy_dremio>=1.1.0"], "cockroachdb": ["cockroachdb==0.3.3"], "thumbnails": ["Pillow>=7.0.0, <8.0.0"], + "excel": ["xlrd>=1.2.0, <1.3"], }, python_requires="~=3.6", author="Apache Software Foundation", diff --git a/superset/app.py b/superset/app.py index 7dc7aec5f4250..c047a284f5ce1 100644 --- a/superset/app.py +++ b/superset/app.py @@ -159,7 +159,11 @@ def init_views(self) -> None: DashboardModelViewAsync, ) from superset.views.database.api import DatabaseRestApi - from superset.views.database.views import DatabaseView, CsvToDatabaseView + from superset.views.database.views import ( + DatabaseView, + CsvToDatabaseView, + ExcelToDatabaseView, + ) from superset.views.datasource import Datasource from superset.views.log.api import LogRestApi from superset.views.log.views import LogModelView @@ -265,6 +269,7 @@ def init_views(self) -> None: appbuilder.add_view_no_menu(Api) appbuilder.add_view_no_menu(CssTemplateAsyncModelView) appbuilder.add_view_no_menu(CsvToDatabaseView) + appbuilder.add_view_no_menu(ExcelToDatabaseView) appbuilder.add_view_no_menu(Dashboard) appbuilder.add_view_no_menu(DashboardModelViewAsync) appbuilder.add_view_no_menu(Datasource) @@ -324,15 +329,35 @@ def init_views(self) -> None: category="SQL Lab", category_label=__("SQL Lab"), ) - appbuilder.add_link( - "Upload a CSV", - label=__("Upload a CSV"), - href="/csvtodatabaseview/form", - icon="fa-upload", - category="Sources", - category_label=__("Sources"), - category_icon="fa-wrench", - ) + if self.config["CSV_EXTENSIONS"].intersection( + self.config["ALLOWED_EXTENSIONS"] + ): + appbuilder.add_link( + "Upload a CSV", + label=__("Upload a CSV"), + href="/csvtodatabaseview/form", + icon="fa-upload", + category="Sources", + category_label=__("Sources"), + category_icon="fa-wrench", + ) + try: + import xlrd # pylint: disable=unused-import + + if self.config["EXCEL_EXTENSIONS"].intersection( + self.config["ALLOWED_EXTENSIONS"] + ): + appbuilder.add_link( + "Upload Excel", + label=__("Upload Excel"), + href="/exceltodatabaseview/form", + icon="fa-upload", + category="Sources", + category_label=__("Sources"), + category_icon="fa-wrench", + ) + except ImportError: + pass # # Conditionally setup log views diff --git a/superset/config.py b/superset/config.py index 5ef169cbc4c1d..89b813fdbb31e 100644 --- a/superset/config.py +++ b/superset/config.py @@ -365,8 +365,9 @@ def _try_json_readsha( # pylint: disable=unused-argument SUPERSET_WEBSERVER_DOMAINS = None # Allowed format types for upload on Database view -# TODO: Add processing of other spreadsheet formats (xls, xlsx etc) -ALLOWED_EXTENSIONS = {"csv", "tsv"} +EXCEL_EXTENSIONS = {"xlsx", "xls"} +CSV_EXTENSIONS = {"csv", "tsv"} +ALLOWED_EXTENSIONS = {*EXCEL_EXTENSIONS, *CSV_EXTENSIONS} # CSV Options: key/value pairs that will be passed as argument to DataFrame.to_csv # method. diff --git a/superset/db_engine_specs/base.py b/superset/db_engine_specs/base.py index c86ee9d0d3171..8375eb95cf976 100644 --- a/superset/db_engine_specs/base.py +++ b/superset/db_engine_specs/base.py @@ -430,6 +430,20 @@ def set_or_update_query_limit(cls, sql: str, limit: int) -> str: parsed_query = sql_parse.ParsedQuery(sql) return parsed_query.set_or_update_query_limit(limit) + @staticmethod + def excel_to_df(**kwargs: Any) -> pd.DataFrame: + """ Read excel into Pandas DataFrame + :param kwargs: params to be passed to DataFrame.read_excel + :return: Pandas DataFrame containing data from excel + """ + kwargs["encoding"] = "utf-8" + kwargs["iterator"] = True + chunks = pd.io.excel.read_excel( + io=kwargs["filepath_or_buffer"], sheet_name=kwargs["sheet_name"] + ) + df = pd.concat(chunk for chunk in chunks.values()) + return df + @staticmethod def csv_to_df(**kwargs: Any) -> pd.DataFrame: """ Read csv into Pandas DataFrame @@ -486,6 +500,28 @@ def convert_dttm(cls, target_type: str, dttm: datetime) -> Optional[str]: """ return None + @classmethod + def create_table_from_excel( # pylint: disable=too-many-arguments + cls, + filename: str, + table: Table, + database: "Database", + excel_to_df_kwargs: Dict[str, Any], + df_to_sql_kwargs: Dict[str, Any], + ) -> None: + """ + Create table from contents of a excel. Note: this method does not create + metadata for the table. + """ + df = cls.excel_to_df(filepath_or_buffer=filename, **excel_to_df_kwargs,) + engine = cls.get_engine(database) + if table.schema: + # only add schema when it is preset and non empty + df_to_sql_kwargs["schema"] = table.schema + if engine.dialect.supports_multivalues_insert: + df_to_sql_kwargs["method"] = "multi" + cls.df_to_sql(df=df, con=engine, **df_to_sql_kwargs) + @classmethod def get_all_datasource_names( cls, database: "Database", datasource_type: str diff --git a/superset/templates/superset/form_view/excel_to_database_view/edit.html b/superset/templates/superset/form_view/excel_to_database_view/edit.html new file mode 100644 index 0000000000000..dcfd6d2ce6a5a --- /dev/null +++ b/superset/templates/superset/form_view/excel_to_database_view/edit.html @@ -0,0 +1,64 @@ +{# + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +#} +{% extends 'appbuilder/general/model/edit.html' %} + +{% block tail_js %} + {{ super() }} + +{% endblock %} \ No newline at end of file diff --git a/superset/views/database/forms.py b/superset/views/database/forms.py index b57caff40457d..13059a4b094f0 100644 --- a/superset/views/database/forms.py +++ b/superset/views/database/forms.py @@ -91,11 +91,15 @@ def at_least_one_schema_is_allowed(database: Database) -> bool: validators=[ FileRequired(), FileAllowed( - config["ALLOWED_EXTENSIONS"], + config["ALLOWED_EXTENSIONS"].intersection(config["CSV_EXTENSIONS"]), _( "Only the following file extensions are allowed: " "%(allowed_extensions)s", - allowed_extensions=", ".join(config["ALLOWED_EXTENSIONS"]), + allowed_extensions=", ".join( + config["ALLOWED_EXTENSIONS"].intersection( + config["CSV_EXTENSIONS"] + ) + ), ), ), ], @@ -206,3 +210,169 @@ def at_least_one_schema_is_allowed(database: Database) -> bool: validators=[Optional()], widget=BS3TextFieldWidget(), ) + + +class ExcelToDatabaseForm(DynamicForm): + # pylint: disable=E0211 + def excel_allowed_dbs(): # type: ignore + excel_allowed_dbs = [] + # TODO: change allow_csv_upload to allow_file_upload + excel_enabled_dbs = ( + db.session.query(Database).filter_by(allow_csv_upload=True).all() + ) + for excel_enabled_db in excel_enabled_dbs: + if ExcelToDatabaseForm.at_least_one_schema_is_allowed(excel_enabled_db): + excel_allowed_dbs.append(excel_enabled_db) + return excel_allowed_dbs + + @staticmethod + def at_least_one_schema_is_allowed(database: Database) -> bool: + """ + If the user has access to the database or all datasource + 1. if schemas_allowed_for_csv_upload is empty + a) if database does not support schema + user is able to upload excel without specifying schema name + b) if database supports schema + user is able to upload excel to any schema + 2. if schemas_allowed_for_csv_upload is not empty + a) if database does not support schema + This situation is impossible and upload will fail + b) if database supports schema + user is able to upload to schema in schemas_allowed_for_csv_upload + elif the user does not access to the database or all datasource + 1. if schemas_allowed_for_csv_upload is empty + a) if database does not support schema + user is unable to upload excel + b) if database supports schema + user is unable to upload excel + 2. if schemas_allowed_for_csv_upload is not empty + a) if database does not support schema + This situation is impossible and user is unable to upload excel + b) if database supports schema + user is able to upload to schema in schemas_allowed_for_csv_upload + """ + if ( + security_manager.database_access(database) + or security_manager.all_datasource_access() + ): + return True + schemas = database.get_schema_access_for_csv_upload() + if schemas and security_manager.schemas_accessible_by_user( + database, schemas, False + ): + return True + return False + + name = StringField( + _("Table Name"), + description=_("Name of table to be created from excel data."), + validators=[DataRequired()], + widget=BS3TextFieldWidget(), + ) + excel_file = FileField( + _("Excel File"), + description=_("Select a Excel file to be uploaded to a database."), + validators=[ + FileRequired(), + FileAllowed( + config["ALLOWED_EXTENSIONS"].intersection(config["EXCEL_EXTENSIONS"]), + _( + "Only the following file extensions are allowed: " + "%(allowed_extensions)s", + allowed_extensions=", ".join( + config["ALLOWED_EXTENSIONS"].intersection( + config["EXCEL_EXTENSIONS"] + ) + ), + ), + ), + ], + ) + + sheet_name = StringField( + _("Sheet Name"), description="Sheet Name", validators=[Optional()] + ) + + con = QuerySelectField( + _("Database"), + query_factory=excel_allowed_dbs, + get_pk=lambda a: a.id, + get_label=lambda a: a.database_name, + ) + schema = StringField( + _("Schema"), + description=_("Specify a schema (if database flavor supports this)."), + validators=[Optional()], + widget=BS3TextFieldWidget(), + ) + if_exists = SelectField( + _("Table Exists"), + description=_( + "If table exists do one of the following: " + "Fail (do nothing), Replace (drop and recreate table) " + "or Append (insert data)." + ), + choices=[ + ("fail", _("Fail")), + ("replace", _("Replace")), + ("append", _("Append")), + ], + validators=[DataRequired()], + ) + header = IntegerField( + _("Header Row"), + description=_( + "Row containing the headers to use as " + "column names (0 is first line of data). " + "Leave empty if there is no header row." + ), + validators=[Optional(), NumberRange(min=0)], + widget=BS3TextFieldWidget(), + ) + index_col = IntegerField( + _("Index Column"), + description=_( + "Column to use as the row labels of the " + "dataframe. Leave empty if no index column." + ), + validators=[Optional(), NumberRange(min=0)], + widget=BS3TextFieldWidget(), + ) + mangle_dupe_cols = BooleanField( + _("Mangle Duplicate Columns"), + description=_('Specify duplicate columns as "X.0, X.1".'), + ) + skipinitialspace = BooleanField( + _("Skip Initial Space"), description=_("Skip spaces after delimiter.") + ) + skiprows = IntegerField( + _("Skip Rows"), + description=_("Number of rows to skip at start of file."), + validators=[Optional(), NumberRange(min=0)], + widget=BS3TextFieldWidget(), + ) + nrows = IntegerField( + _("Rows to Read"), + description=_("Number of rows of file to read."), + validators=[Optional(), NumberRange(min=0)], + widget=BS3TextFieldWidget(), + ) + decimal = StringField( + _("Decimal Character"), + default=".", + description=_("Character to interpret as decimal point."), + validators=[Optional(), Length(min=1, max=1)], + widget=BS3TextFieldWidget(), + ) + index = BooleanField( + _("Dataframe Index"), description=_("Write dataframe index as a column.") + ) + index_label = StringField( + _("Column Label(s)"), + description=_( + "Column label for index column(s). If None is given " + "and Dataframe Index is True, Index Names are used." + ), + validators=[Optional()], + widget=BS3TextFieldWidget(), + ) diff --git a/superset/views/database/views.py b/superset/views/database/views.py index ae4b3bc116656..c8153d11bd89c 100644 --- a/superset/views/database/views.py +++ b/superset/views/database/views.py @@ -20,9 +20,9 @@ from flask import flash, g, redirect from flask_appbuilder import SimpleFormView -from flask_appbuilder.forms import DynamicForm from flask_appbuilder.models.sqla.interface import SQLAInterface from flask_babel import lazy_gettext as _ +from werkzeug.wrappers import Response from wtforms.fields import StringField from wtforms.validators import ValidationError @@ -32,12 +32,10 @@ from superset.constants import RouteMethod from superset.exceptions import CertificateException from superset.sql_parse import Table -from superset.typing import FlaskResponse from superset.utils import core as utils from superset.views.base import DeleteMixin, SupersetModelView, YamlExportMixin -from superset.views.database.forms import CsvToDatabaseForm -from .forms import CsvToDatabaseForm +from .forms import CsvToDatabaseForm, ExcelToDatabaseForm from .mixins import DatabaseMixin from .validators import schema_allows_csv_upload, sqlalchemy_uri_validator @@ -48,9 +46,7 @@ stats_logger = config["STATS_LOGGER"] -def sqlalchemy_uri_form_validator( # pylint: disable=unused-argument - form: DynamicForm, field: StringField -) -> None: +def sqlalchemy_uri_form_validator(_: _, field: StringField) -> None: """ Check if user has submitted a valid SQLAlchemy URI """ @@ -58,9 +54,7 @@ def sqlalchemy_uri_form_validator( # pylint: disable=unused-argument sqlalchemy_uri_validator(field.data, exception=ValidationError) -def certificate_form_validator( # pylint: disable=unused-argument - form: DynamicForm, field: StringField -) -> None: +def certificate_form_validator(_: _, field: StringField) -> None: """ Check if user has submitted a valid SSL certificate """ @@ -116,7 +110,7 @@ def form_get(self, form: CsvToDatabaseForm) -> None: form.decimal.data = "." form.if_exists.data = "fail" - def form_post(self, form: CsvToDatabaseForm) -> FlaskResponse: + def form_post(self, form: CsvToDatabaseForm) -> Response: database = form.con.data csv_table = Table(table=form.name.data, schema=form.schema.data) @@ -249,3 +243,149 @@ def form_post(self, form: CsvToDatabaseForm) -> FlaskResponse: flash(message, "info") stats_logger.incr("successful_csv_upload") return redirect("/tablemodelview/list/") + + +class ExcelToDatabaseView(SimpleFormView): + form = ExcelToDatabaseForm + form_template = "superset/form_view/excel_to_database_view/edit.html" + form_title = _("Excel to Database configuration") + add_columns = ["database", "schema", "table_name"] + + def form_get(self, form: ExcelToDatabaseForm) -> None: + form.header.data = 0 + form.mangle_dupe_cols.data = True + form.skipinitialspace.data = False + form.decimal.data = "." + form.if_exists.data = "fail" + form.sheet_name = None + + def form_post(self, form: ExcelToDatabaseForm) -> Response: + database = form.con.data + excel_table = Table(table=form.name.data, schema=form.schema.data) + + if not schema_allows_csv_upload(database, excel_table.schema): + message = _( + 'Database "%(database_name)s" schema "%(schema_name)s" ' + "is not allowed for excel uploads. Please contact your Superset Admin.", + database_name=database.database_name, + schema_name=excel_table.schema, + ) + flash(message, "danger") + return redirect("/exceltodatabaseview/form") + + if "." in excel_table.table and excel_table.schema: + message = _( + "You cannot specify a namespace both in the name of the table: " + '"%(excel_table.table)s" and in the schema field: ' + '"%(excel_table.schema)s". Please remove one', + table=excel_table.table, + schema=excel_table.schema, + ) + flash(message, "danger") + return redirect("/exceltodatabaseview/form") + + uploaded_tmp_file_path = tempfile.NamedTemporaryFile( + dir=app.config["UPLOAD_FOLDER"], + suffix=os.path.splitext(form.excel_file.data.filename)[1].lower(), + delete=False, + ).name + + try: + utils.ensure_path_exists(config["UPLOAD_FOLDER"]) + upload_stream_write(form.excel_file.data, uploaded_tmp_file_path) + + con = form.data.get("con") + database = ( + db.session.query(models.Database).filter_by(id=con.data.get("id")).one() + ) + excel_to_df_kwargs = { + "header": form.header.data if form.header.data else 0, + "index_col": form.index_col.data, + "mangle_dupe_cols": form.mangle_dupe_cols.data, + "skipinitialspace": form.skipinitialspace.data, + "skiprows": form.skiprows.data, + "nrows": form.nrows.data, + "sheet_name": form.sheet_name.data, + "chunksize": 1000, + } + df_to_sql_kwargs = { + "name": excel_table.table, + "if_exists": form.if_exists.data, + "index": form.index.data, + "index_label": form.index_label.data, + "chunksize": 1000, + } + database.db_engine_spec.create_table_from_excel( + uploaded_tmp_file_path, + excel_table, + database, + excel_to_df_kwargs, + df_to_sql_kwargs, + ) + + # Connect table to the database that should be used for exploration. + # E.g. if hive was used to upload a excel, presto will be a better option + # to explore the table. + expore_database = database + explore_database_id = database.get_extra().get("explore_database_id", None) + if explore_database_id: + expore_database = ( + db.session.query(models.Database) + .filter_by(id=explore_database_id) + .one_or_none() + or database + ) + + sqla_table = ( + db.session.query(SqlaTable) + .filter_by( + table_name=excel_table.table, + schema=excel_table.schema, + database_id=expore_database.id, + ) + .one_or_none() + ) + + if sqla_table: + sqla_table.fetch_metadata() + if not sqla_table: + sqla_table = SqlaTable(table_name=excel_table.table) + sqla_table.database = expore_database + sqla_table.database_id = database.id + sqla_table.user_id = g.user.id + sqla_table.schema = excel_table.schema + sqla_table.fetch_metadata() + db.session.add(sqla_table) + db.session.commit() + except Exception as ex: # pylint: disable=broad-except + db.session.rollback() + try: + os.remove(uploaded_tmp_file_path) + except OSError: + pass + message = _( + 'Unable to upload Excel file "%(filename)s" to table ' + '"%(table_name)s" in database "%(db_name)s". ' + "Error message: %(error_msg)s", + filename=form.excel_file.data.filename, + table_name=form.name.data, + db_name=database.database_name, + error_msg=str(ex), + ) + + flash(message, "danger") + stats_logger.incr("failed_excel_upload") + return redirect("/exceltodatabaseview/form") + + os.remove(uploaded_tmp_file_path) + # Go back to welcome page / splash screen + message = _( + 'CSV file "%(excel_filename)s" uploaded to table "%(table_name)s" in ' + 'database "%(db_name)s"', + excel_filename=form.excel_file.data.filename, + table_name=str(excel_table), + db_name=sqla_table.database.database_name, + ) + flash(message, "info") + stats_logger.incr("successful_excel_upload") + return redirect("/tablemodelview/list/")