Thursday, June 16, 2011

SSIS - Excel import with unknown number of columns

I have a bunch of xls files I want to import into a table. I don't know the name of the files, the name of the first Sheet, nor the amount of columns. So here is at least one way to do it.

Step 1: Foreach Loop Container
I'll use the Foreach Loop Container in the Control Flow to pick up each unknown files.

Step 1a: Add the Foreach Loop Container


Step 1b: Select the folder and set the Files to *.xls. I used an expression to set the directory so I can set that dynamically. I'm pulling it from a SQL table.


Step 1c: In the Variable Mappings, set a Variable to the 0 Index. This will be the full URI to the file.


Step 2: Get the name of the first Sheet to know what to pull in.

Step 2a: Add a Script Task inside the Foreach Loop Container in Control Flow.


Step 2b: Add Variables to be used in the script


*** UPDATE *** : If you want to add XLSX to this also, then you will need to download and install this driver from Microsoft. I've updated the code (2c and 3g) to include reference to this driver for XLSX files.

Step 2c: Add this script to get the first Sheet name.
using System;
using System.Data;
using Microsoft.SqlServer.Dts.Runtime;
using System.Windows.Forms;
using System.Xml;
using System.Data.OleDb;
using System.Diagnostics;

namespace ST_8f8a8b14140a4ded9dcbbf6aa5b97090.csproj
{
[System.AddIn.AddIn("ScriptMain", Version = "1.0", Publisher = "", Description = "")]
public partial class ScriptMain : Microsoft.SqlServer.Dts.Tasks.ScriptTask.VSTARTScriptObjectModelBase
{

#region VSTA generated code
enum ScriptResults
{
Success = Microsoft.SqlServer.Dts.Runtime.DTSExecResult.Success,
Failure = Microsoft.SqlServer.Dts.Runtime.DTSExecResult.Failure
};
#endregion

public void Main()
{
string excelFile;
string connectionString;
OleDbConnection excelConnection;
DataTable tablesInFile;
int tableCount = 0;
string currentTable;

try
{
Trace.WriteLine(string.Format("Sheet Name: {0} File Location: {1}", Dts.Variables["SheetName"].Value, Dts.Variables["ImportFilePath"].Value));
string extension = Path.GetExtension(excelFile);
extension = extension.Replace(".", "");

if (extension.ToLower() == "xlsx")
{
connectionString = "Provider=Microsoft.ACE.OLEDB.12.0;" +
"Data Source=" + excelFile + ";Extended Properties=Excel 12.0";
}
else
{
connectionString = "Provider=Microsoft.Jet.OLEDB.4.0;" +
"Data Source=" + excelFile + ";Extended Properties=Excel 8.0";
}
excelConnection = new OleDbConnection(connectionString);
excelConnection.Open();
tablesInFile = excelConnection.GetSchema("Tables");
tableCount = tablesInFile.Rows.Count;
if (tableCount > 0)
{
currentTable = tablesInFile.Rows[0]["TABLE_NAME"].ToString();

Dts.Variables["SheetName"].Value = currentTable;

Dts.TaskResult = (int)ScriptResults.Success;

Trace.WriteLine(string.Format("Sheet Name: {0}",Dts.Variables["SheetName"].Value));
}
else
{
Dts.Variables["ErrorCode"].Value = "Excel Sheet Lookup Failure. -- No Sheets Found.";
Dts.TaskResult = (int)ScriptResults.Failure;
}
}
catch (Exception e)
{
Dts.Variables["ErrorCode"].Value = "Excel Sheet Lookup Failure. -- " + e.Message;
throw new Exception("Excel Sheet Lookup Failure.", e);
}

}
}
}



Step 3: Create a Data Flow to handle importing the data in.

Step 3a: Add the data flow task. Then enter that data flow.


Step 3b : Add a Script Component - I finally went with the Script Component because I couldn't get an Excel Source Task to work consistantly. If the file had formating or had less columns than I preset in the Excel Source Task, I'd get issues.


Step 3c: Select Source as the script type


Step 3d: Add Variables with the file location and sheet name


Step 3e: No Connection Manager is needed if you are going to work with the imported data in the data flow before inserting it into a destination Connection Manager, like ADO SQL connection.


Step 3f: Add Output Fields - I added 25 String[DT_STR] fields 255 characters long.


*** UPDATE *** : As mentioned in the comments, if the code gets stuck on the excelReader.Read, then you may need to move the call to GetDataFromExcelToReader(Variables.ImportFilePath) from the PreExecute method to the CreateNewOutputRows method, just before the while loop.

Step 3g: Add this script
using System;
using System.Data;
using Microsoft.SqlServer.Dts.Pipeline.Wrapper;
using Microsoft.SqlServer.Dts.Runtime.Wrapper;
using System.IO;
using System.Data.SqlClient;
using System.Data.OleDb;
using System.Diagnostics;

[Microsoft.SqlServer.Dts.Pipeline.SSISScriptComponentEntryPointAttribute]
public class ScriptMain : UserComponent
{
private OleDbDataReader excelReader;
private OleDbConnection excelConnection;
private OleDbCommand excelCommand;

public override void PreExecute()
{
base.PreExecute();

// Open connection
GetDataFromExcelToReader(Variables.ImportFilePath);
}

public override void PostExecute()
{
base.PostExecute();

excelReader.Close();

excelConnection.Close();
}

private void GetDataFromExcelToReader(string p_strFileName)
{

string l_strConnectionString;

if (File.Exists(p_strFileName))
{
string extension = Path.GetExtension(p_strFileName);
extension = extension.Replace(".", "");

if (extension.ToLower() == "xlsx")
{
l_strConnectionString = "Provider=Microsoft.ACE.OLEDB.12.0;" +
"Data Source=" + p_strFileName + ";Extended Properties=\"Excel 12.0;HDR=NO;IMEX=1\";";
}
else
{
l_strConnectionString = "Provider=Microsoft.Jet.OLEDB.4.0;" +
"Data Source=" + p_strFileName + ";Extended Properties=\"Excel 4.0;HDR=NO;IMEX=1\";";
}
excelConnection = new OleDbConnection(l_strConnectionString);
excelConnection.Open();
excelCommand = excelConnection.CreateCommand();
excelCommand.CommandText = "SELECT * FROM [" + Variables.SheetName.ToString() + "]";
excelCommand.CommandType = CommandType.Text;
excelReader = excelCommand.ExecuteReader();

}
}

public override void CreateNewOutputRows()
{
while (excelReader.Read())
{
Output0Buffer.AddRow();
Output0Buffer.F1 = excelReader.FieldCount > 0 ? excelReader[0].ToString() : "";
Output0Buffer.F2 = excelReader.FieldCount > 1 ? excelReader[1].ToString() : "";
Output0Buffer.F3 = excelReader.FieldCount > 2 ? excelReader[2].ToString() : "";
Output0Buffer.F4 = excelReader.FieldCount > 3 ? excelReader[3].ToString() : "";
Output0Buffer.F5 = excelReader.FieldCount > 4 ? excelReader[4].ToString() : "";
Output0Buffer.F6 = excelReader.FieldCount > 5 ? excelReader[5].ToString() : "";
Output0Buffer.F7 = excelReader.FieldCount > 6 ? excelReader[6].ToString() : "";
Output0Buffer.F8 = excelReader.FieldCount > 7 ? excelReader[7].ToString() : "";
Output0Buffer.F9 = excelReader.FieldCount > 8 ? excelReader[8].ToString() : "";
Output0Buffer.F10 = excelReader.FieldCount > 9 ? excelReader[9].ToString() : "";
Output0Buffer.F11 = excelReader.FieldCount > 10 ? excelReader[10].ToString() : "";
Output0Buffer.F12 = excelReader.FieldCount > 11 ? excelReader[11].ToString() : "";
Output0Buffer.F13 = excelReader.FieldCount > 12 ? excelReader[12].ToString() : "";
Output0Buffer.F14 = excelReader.FieldCount > 13 ? excelReader[13].ToString() : "";
Output0Buffer.F15 = excelReader.FieldCount > 14 ? excelReader[14].ToString() : "";
Output0Buffer.F16 = excelReader.FieldCount > 15 ? excelReader[15].ToString() : "";
Output0Buffer.F17 = excelReader.FieldCount > 16 ? excelReader[16].ToString() : "";
Output0Buffer.F18 = excelReader.FieldCount > 17 ? excelReader[17].ToString() : "";
Output0Buffer.F19 = excelReader.FieldCount > 18 ? excelReader[18].ToString() : "";
Output0Buffer.F20 = excelReader.FieldCount > 19 ? excelReader[19].ToString() : "";
Output0Buffer.F21 = excelReader.FieldCount > 20 ? excelReader[20].ToString() : "";
Output0Buffer.F22 = excelReader.FieldCount > 21 ? excelReader[21].ToString() : "";
Output0Buffer.F23 = excelReader.FieldCount > 22 ? excelReader[22].ToString() : "";
Output0Buffer.F24 = excelReader.FieldCount > 23 ? excelReader[23].ToString() : "";
Output0Buffer.F25 = excelReader.FieldCount > 24 ? excelReader[24].ToString() : "";
}
}
}



Step 4: Use the data you've imported as needed. Massage it. Push it into SQL tables. Merge it with other data.