This is a quick and dirty post on how I deal with ugly Excel as a source in response to this
StackOverflow question. Originally, I built this script to address some data I needed to import that was stored in Excel. It wasn't typical Excel data of rows and columns. No, it was a repeating report that was exported to Excel. There was no way in hell I was going to be able to make the Excel driver work with that data to get it imported. I recently had cause to bring this code out to parse a file that will be consistent with the layout but the column headers will change between sheets. It's a forecasting spreadsheet so tab 1 is people, table 2 is dollars, tab 3 is hours, etc.
Script Task
You will need to define your output columns and their data types and then go in and add 2 classes to the existing project which are described below.
Main
This is used in the Script Task's Main section. There is probably a more graceful way of doing this but it works. In my PreExecute, I instantiate a DataSet by calling the static method ReadExcelQuery. This fills a "raw" dataset, which is literally the data exactly out of Excel. I then pass the raw dataset into the ParseSample method and get a different dataset out. In this example, there were two sheets in the Excel Workbook that I needed to get the data out of. After the PreExecute has completed, my member variable parsedData is populated with a few data tables. There is a raw, which I find useful for seeing the data exactly as it was extracted from Excel. This was useful when I was running into dates which were really just integers (see the ConvertXlsdtToDateTime method). There will be a parsed data table which is populated from your specific Parse method. Finally, there is an Error data table which lists the row & column that didn't parse as expected.
Given the fun that is debugging in SSIS Script tasks, I strongly encourage you to create a .NET Console project. Inside that project's main, you can basically paste the PreExecute method's code in there and then work on getting your Parse logic squared away. Once that's done, then add them into the actual SSIS Script Task. At that point, you're simply wiring up the resultant dataset into the OuputBuffer.
1: using System;
2: using System.Data;
3: using System.Collections.Generic;
4: using Microsoft.SqlServer.Dts.Pipeline.Wrapper;
5: using Microsoft.SqlServer.Dts.Runtime.Wrapper;
6:
7: /// <summary>
8: /// This is the class to which to add your code. Do not change the name, attributes, or parent
9: /// of this class.
10: /// </summary>
11: [Microsoft.SqlServer.Dts.Pipeline.SSISScriptComponentEntryPointAttribute]
12: publicclass ScriptMain : UserComponent
13: {
14: DataSet parsedData;
15:
16: publicoverridevoid PreExecute()
17: {
18: base.PreExecute();
19: this.parsedData = new DataSet();
20:
21: string fileName = this.Variables.CurrentFileName;
22: DataSet ds = null;
23: string worksheetName = string.Empty;
24: bool hasHeader = false;
25: DataSet excel = null;
26:
27: List<string> worksheets = new List<string>() { "Sheet1$", "Sheet2$"};
28: foreach (string worksheet in worksheets)
29: {
30: excel = ExcelReader.ReadExcelQuery(fileName, worksheet, hasHeader);
31: if (ds == null)
32: {
33: ds = ExcelParser.ParseSample(FileFormatter.Sample, excel, worksheet);
34: }
35: else
36: {
37: ds.Merge(ExcelParser.ParseSample(FileFormatter.Sample, excel, worksheet));
38: }
39: }
40:
41: this.parsedData = ds.Copy();
42: }
43:
44: /// <summary>
45: /// Slog through parsedData and emit
46: /// </summary>
47: publicoverridevoid CreateNewOutputRows()
48: {
49: bool pbCancel = false;
50: foreach (DataRow item inthis.parsedData.Tables["Sample_TYPE"].Rows)
51: {
52: try
53: {
54: SampleBuffer.AddRow();
55: SampleBuffer.Company = item["Company"].ToString();
56: SampleBuffer.Entity = item["Entity"].ToString();
57: SampleBuffer.Function = item["Function"].ToString();
58:
59: }
60: catch (Exception)
61: {
62:
63: throw;
64: }
65:
66: }
67:
68: foreach (DataRow item inthis.parsedData.Tables["ERROR"].Rows)
69: {
70: ErrorOutputBuffer.AddRow();
71: ErrorOutputBuffer.entity = item["entity"].ToString();
72: ErrorOutputBuffer.severity = item["severity"].ToString();
73: ErrorOutputBuffer.messagecode = item["msg_cd"].ToString();
74: ErrorOutputBuffer.messagetext = item["msg_tx"].ToString();
75: ErrorOutputBuffer.value = item["value"].ToString();
76: ComponentMetaData.FireError(0, "Sample parser", item["value"].ToString(), string.Empty, 0, out pbCancel);
77: }
78:
79: }
80:
81: }
ExcelParser class
Here is where the guts of your coding belongs. You will work through that raw dataset in a row by row fashion. It's not the greatest but it works. You will also need to define what your intended output looks like in the "MockDataTable" method.
1: using System;
2: using System.Collections.Generic;
3: using System.Data;
4: using System.Data.OleDb;
5: using System.Data.SqlClient;
6: using System.Diagnostics;
7: using System.Linq;
8: using System.Text;
9: using System.Text.RegularExpressions;
10:
11: /// <summary>
12: /// This class is concerned with parsing data from raw datasets
13: /// </summary>
14: publicclass ExcelParser
15: {
16: /// <summary>
17: /// Value translates to Class.MethodName, used for logging
18: /// </summary>
19: privatestaticstring logFileName = System.Reflection.MethodBase.GetCurrentMethod().DeclaringType.ToString();
20:
21: /// <summary>
22: /// Prevents a default instance of ExcelParser class from being created.
23: /// </summary>
24: private ExcelParser()
25: {
26: }
27:
28: /// <summary>
29: /// Parse some Excel file
30: /// </summary>
31: /// <param name="fileFormat"></param>
32: /// <param name="rawData"></param>
33: /// <returns></returns>
34: publicstatic DataSet ParseSample(FileFormatter fileFormat, DataSet rawData, string workSheetName)
35: {
36: System.Data.DataSet ds = null;
37: System.Data.DataTable dt = null;
38: System.Data.DataTable errors = null;
39: System.Data.DataRow newRow = null;
40:
41: // monotomically increasing number
42: int rowNumber = 0;
43:
44: // Key to understand when a new group has been found
45: string currentEntity = string.Empty;
46: string current = string.Empty;
47: decimal someValue = 0;
48:
49: // Error reporting
50: string errorSeverity = string.Empty;
51: string errorCode = string.Empty;
52: string errorMessage = string.Empty;
53: string errorValue = string.Empty;
54:
55: errorSeverity = "fatal";
56: errorCode = "invalid data type";
57: errorMessage = "The supplied value does not match expected type";
58: errors = ExcelParser.MockErrorTable();
59:
60: ds = new DataSet();
61: dt = ExcelParser.MockDataTable(fileFormat);
62: int outRowOrdinal = 15;
63: int outYearOrdinal = 16;
64: int outTabNameOrdinal = 17;
65: bool keepRow = true;
66: int processingYear = 0;
67:
68: // Parsing logic goes here
69: try
70: {
71: foreach (DataRow dr in rawData.Tables[0].Rows)
72: {
73: // Skip first 3 rows
74: newRow = dt.NewRow();
75: if (rowNumber < 2)
76: {
77: rowNumber++;
78: continue;
79: }
80:
81: // On the third row, we need to peel out the year
82: if (rowNumber == 2)
83: {
84: // Jan 13 Forecast
85: current = dr[3].ToString().Split()[1];
86: processingYear = Int32.Parse(current) + 2000;
87: rowNumber++;
88: continue;
89: }
90:
91: // Keep track of the forecast amount as we only store rows which
92: // are not zeroed out.
93: keepRow = false;
94:
95: for (int i = 0; i < 15; i++)
96: {
97: current = dr[i].ToString();
98: switch (i)
99: {
100:
101: case 0:
102: // Company
103: newRow[i] = current;
104: break;
105: case 1:
106: // Department
107: if (!string.IsNullOrEmpty(current))
108: {
109: currentEntity = current.Trim();
110: }
111: newRow[i] = currentEntity;
112: break;
113: case 2:
114: // Division
115:
116: newRow[i] = current.Trim();
117: break;
118: default:
119: if (!Decimal.TryParse(current, out someValue))
120: {
121: errorValue = string.Format(
122: "Unable to convert {0} to an decimal. '{3}' ({1}{2})",
123: current,
124: ExcelParser.GetExcelColumnName(i),
125: rowNumber,
126: workSheetName);
127: errors.Rows.Add(newobject[] { fileFormat.ToString(), errorSeverity, errorCode, errorMessage, errorValue });
128: }
129: else
130: {
131: newRow[i] = someValue;
132: if (someValue != 0)
133: {
134: keepRow = true;
135: }
136: }
137:
138: break;
139: }
140: }
141:
142: newRow[outRowOrdinal] = rowNumber++;
143: newRow[outTabNameOrdinal] = workSheetName.TrimEnd(new[] { '$' });
144: newRow[outYearOrdinal] = processingYear;
145: if (keepRow)
146: {
147: dt.Rows.Add(newRow);
148: }
149: }
150: }
151: catch (Exception ex)
152: {
153: errorValue = string.Format(
154: "{0} {4} '{3}' {1}{2}",
155: logFileName,
156: ExcelParser.GetExcelColumnName(0),
157: rowNumber,
158: workSheetName,
159: ex.Message);
160: //LogUtil.Error(logFileName, errorValue, ex);
161: errors.Rows.Add(newobject[] { fileFormat.ToString(), errorSeverity, "UnableToParse", "Unable to process file. Contact development", errorValue });
162: }
163:
164: ds.Tables.Add(dt);
165: ds.Tables.Add(errors);
166:
167: //LogUtil.Debug(System.Reflection.MethodBase.GetCurrentMethod().DeclaringType.ToString(), string.Format("{0} Leaving method", ExcelParser.logFileName));
168:
169: return ds;
170: }
171:
172: /// <summary>
173: /// Method to generate the format for an error table
174: /// </summary>
175: /// <returns>A template for error messages</returns>
176: publicstatic DataTable MockErrorTable()
177: {
178: DataTable dt = null;
179: dt = new DataTable("ERROR");
180: dt.Columns.Add("entity", System.Type.GetType("System.String"));
181: dt.Columns.Add("severity", System.Type.GetType("System.String"));
182: dt.Columns.Add("msg_cd", System.Type.GetType("System.String"));
183: dt.Columns.Add("msg_tx", System.Type.GetType("System.String"));
184: dt.Columns.Add("value", System.Type.GetType("System.String"));
185: return dt;
186: }
187:
188: /// <summary>
189: /// Translate a number into its equivalent Excel column name. e.g. 0 -> A; 2 -> C
190: /// </summary>
191: /// <remarks>http://stackoverflow.com/questions/181596/how-to-convert-a-column-number-eg-127-into-an-excel-column-eg-aa</remarks>
192: /// <param name="columnNumber">A zero based number that maps to an excel column ordinal</param>
193: /// <returns>A string of letters that are Excel column names</returns>
194: publicstaticstring GetExcelColumnName(int columnNumber)
195: {
196: int dividend = columnNumber + 1;
197: string columnName = String.Empty;
198: int modulo;
199:
200: while (dividend > 0)
201: {
202: modulo = (dividend - 1) % 26;
203: columnName = Convert.ToChar(65 + modulo).ToString() + columnName;
204: dividend = (int)((dividend - modulo) / 26);
205: }
206:
207: return columnName;
208: }
209:
210: /// <summary>
211: /// Seriously? For the loss
212: /// <see cref="http://www.debugging.com/bug/19252"></see>
213: /// </summary>
214: /// <param name="excelDate">Number of days since 1900-01-01</param>
215: /// <returns>The converted days to date</returns>
216: publicstatic DateTime ConvertXlsdtToDateTime(int excelDate)
217: {
218: DateTime dt = new DateTime(1899, 12, 31);
219:
220: // adjust for 29 Feb 1900 which Excel considers a valid date
221: if (excelDate >= 60)
222: {
223: excelDate--;
224: }
225:
226: return dt.AddDays(excelDate);
227: }
228:
229: /// <summary>
230: /// In an attempt to simplify the actual parsing code, I have moved the
231: /// table creation and column definition into this method.
232: /// </summary>
233: /// <param name="fileFormat">The appropriate fileformat to use</param>
234: /// <returns>A well formed datatable for the given input type</returns>
235: privatestatic DataTable MockDataTable(FileFormatter fileFormat)
236: {
237: DataTable dt = null;
238:
239: switch (fileFormat)
240: {
241: case FileFormatter.Sample:
242: dt = new System.Data.DataTable("Sample_TYPE", "tableNamespace");
243: dt.Columns.Add("Company", System.Type.GetType("System.String"));
244: dt.Columns.Add("Entity", System.Type.GetType("System.String"));
245: dt.Columns.Add("Function", System.Type.GetType("System.String"));
246: break;
247: default:
248: dt = new System.Data.DataTable("Unknown", "tableNamespace");
249: dt.Columns.Add("row_id", System.Type.GetType("System.Int32"));
250: dt.Columns.Add("name", System.Type.GetType("System.String"));
251: dt.Columns.Add("value", System.Type.GetType("System.Decimal"));
252: break;
253: }
254:
255: return dt;
256: }
257: }
258:
259: /// <summary>
260: /// The specific file formatters.
261: /// </summary>
262: publicenum FileFormatter
263: {
264: /// <summary>
265: /// Budgeting file
266: /// </summary>
267: Sample,
268: }
ExcelReader class
This class is responsible for querying columns out of an Excel as a dataset. A worksheet has a $ at the end of its name which isn't visible in Excel. This also works with named ranges which won't have a trailing $.
1: using System;
2: using System.Collections.Generic;
3: using System.Data;
4: using System.Data.OleDb;
5: using System.Data.SqlClient;
6: using System.Diagnostics;
7: using System.Linq;
8: using System.Text;
9: using System.Text.RegularExpressions;
10:
11: /// <summary>
12: /// This class is concerned with extracting data from non-tabular Excel
13: /// documents.
14: /// </summary>
15: publicclass ExcelReader
16: {
17: /// <summary>
18: /// Value translates to Class.MethodName, used for logging
19: /// </summary>
20: privatestaticstring logFileName = System.Reflection.MethodBase.GetCurrentMethod().DeclaringType.ToString();
21:
22: /// <summary>
23: /// Template for an OLEDB provider to read Excel data
24: /// </summary>
25: privatestaticstring providerTemplate = "Provider=Microsoft.Jet.OLEDB.4.0;Data Source=\"{0}\";Extended Properties=\"Excel 8.0;HDR={1};IMEX=1\"";
26:
27: /// <summary>
28: /// Template for an OLEDB provider to read Excel data
29: /// </summary>
30: privatestaticstring providerTemplateXLSX = "Provider=Microsoft.ACE.OLEDB.12.0;Data Source=\"{0}\";Extended Properties=\"Excel 12.0 Xml;HDR={1};IMEX=1\"";
31:
32: /// <summary>
33: /// The row id column will always be the last element in the unparsed data
34: /// </summary>
35: privatestaticstring rowIdColumnName = "row_id";
36:
37: /// <summary>
38: /// Prevents a default instance of the ExcelReader class from being created.
39: /// Static methods so constructor is hidden
40: /// </summary>
41: private ExcelReader()
42: {
43: }
44:
45: /// <summary>
46: /// Gets the row id column name
47: /// </summary>
48: publicstaticstring RowIdColumnName
49: {
50: get { return ExcelReader.rowIdColumnName; }
51: }
52:
53: /// <summary>
54: /// Utility method for pulling base excel data into a dataset
55: /// </summary>
56: /// <param name="filePath">Base folder for a file</param>
57: /// <param name="fileName">actual file name</param>
58: /// <param name="worksheetName">The name of the Excel worksheet</param>
59: /// <param name="hasHeader">Boolean indicating whether there is a header row</param>
60: /// <returns>A dataset representation of a spreadsheet</returns>
61: publicstatic DataSet ReadExcelQuery(string filePath, string fileName, string worksheetName, bool hasHeader)
62: {
63: return ExcelReader.ReadExcelQuery(System.IO.Path.Combine(filePath, fileName), worksheetName, hasHeader);
64: }
65:
66: /// <summary>
67: /// Utility method for pulling base excel data into a dataset
68: /// </summary>
69: /// <param name="fileName">The filename to work against</param>
70: /// <param name="worksheetName">The name of the Excel worksheet</param>
71: /// <param name="hasHeader">Boolean indicating whether there is a header row</param>
72: /// <returns>Dataset with raw dataset</returns>
73: publicstatic DataSet ReadExcelQuery(string fileName, string worksheetName, bool hasHeader)
74: {
75: DataSet dataSet = new DataSet();
76: string datatableName = string.Empty;
77: string excelConnectionString = string.Empty;
78: string query = string.Empty;
79: datatableName = "RawData";
80: query = string.Format("SELECT * FROM [{0}]", worksheetName);
81:
82: excelConnectionString = string.Format(ExcelReader.providerTemplate, fileName, hasHeader ? "YES" : "NO");
83:
84: //Console.WriteLine(string.Format("{0} Processing file {1} Query {2}", logFileName, fileName, query));
85: //Console.WriteLine(string.Format("{0} excelConnectionString {1}", logFileName, fileName));
86: //Console.WriteLine(string.Format("{0} excelConnectionString {1}", logFileName, excelConnectionString));
87:
88: // Ensure file exists
89: if (!System.IO.File.Exists(fileName))
90: {
91: //Console.WriteLine(string.Format("FileNotFound! Expected fileName: {0}", logFileName, fileName),
92: // new System.IO.FileNotFoundException("FileNotFount exception", fileName));
93: return dataSet;
94: }
95:
96: try
97: {
98: OleDbDataAdapter oleDbDataAdapter = new OleDbDataAdapter(
99: query, excelConnectionString);
100:
101: dataSet.Tables.Add(datatableName);
102: oleDbDataAdapter.Fill(dataSet.Tables[datatableName]);
103:
104: // add a column in for record id
105: DataColumn rowIdColumn = new DataColumn(ExcelReader.rowIdColumnName, System.Type.GetType("System.Int32"));
106:
107: dataSet.Tables[datatableName].Columns.Add(rowIdColumn);
108:
109: // populate record id
110: for (int i = 0; i < dataSet.Tables[datatableName].Rows.Count; i++)
111: {
112: dataSet.Tables[datatableName].Rows[i][ExcelReader.rowIdColumnName] = i + 1;
113: if (hasHeader)
114: {
115: // TODO: fix this dreck
116: // This section is all wrong and untested code
117: if (i == 0)
118: {
119: i++;
120: }
121:
122: dataSet.Tables[datatableName].Rows[i][ExcelReader.rowIdColumnName] = i + 1;
123: }
124: }
125:
126: //LogUtil.Info(logFileName, string.Format("Dataset created for file: {0}", fileName));
127: return dataSet;
128: }
129: catch (System.IO.IOException ex)
130: {
131: // Most likely the file is locked by another process
132: //LogUtil.Error(logFileName, string.Format("Exclusive lock on file: {0} Error: {1}", fileName, ex.ToString()));
133: returnnew DataSet();
134: }
135: catch (System.Data.OleDb.OleDbException ex)
136: {
137: // The tab name does not exist in the spreadsheet.
138: // Odds are decent they forgot to append a $ to the tab name
139: //Console.Error.WriteLine(string.Format("Unable to find tab {2} in file {0} Error: {1}", fileName, ex.ToString(), worksheetName));
140:
141: if (!worksheetName.EndsWith("$"))
142: {
143: //LogUtil.Error(logFileName, string.Format("Retrying as tab {1}$ in file {0}", fileName, worksheetName));
144: return ExcelReader.ReadExcelQuery(fileName, worksheetName + "$", hasHeader);
145: }
146:
147: returnnew DataSet();
148: }
149: catch (Exception ex)
150: {
151: //LogUtil.Error(logFileName, string.Format("Reading inital dataset failed: {0} Error: {1}", fileName, ex.ToString()));
152: returnnew DataSet();
153: }
154: }
155: }