Mike M Eghtebas
Database Development, Web Programming
Sample Projects
Data Scraping:

Task: Over million rows of data per year for the span of two decades was required to be downloaded for water contamination analysis by an environmental consulting firm in Santa Monica, CA. The client, state of Louisiana, was expecting the consultant retrieve all of the data from their website.
Solution: After a brief analysis of the data, I did the following:
- Developed the relational table structure needed in SQL Server 2012 to maintain the data.
- Developed a Data Scraping module in ASP.net and C# using XPath to collect the drinking water data from the firm's website.
- Cleaned up the scraped data.
- Loaded the cleaned data into the relational tables
SQL Server tables and stored procedures (Data Scraping):
-- connect to swapeLAdata use swapeLAdata GO -- create LabSampleData create table LabSampleData( IDinAccess char(7) Not NULL , excludeYN bit default 0 , WaterSystemNo varchar(15) , WaterSystemName varchar(100) , PrincipalParishServed varchar(15) , [Status] varchar(5) , LabSampleNo varchar(15) , FederalType varchar(5) , StateType varchar(5) , PrimarySource varchar(15) , ActivityDate date , CollectionDate date , Note varchar(500) , [Year] nvarchar(4) , CONSTRAINT PK_IDinAccess PRIMARY KEY CLUSTERED ( [IDinAccess] ASC )) ON [PRIMARY] --alter table LabSampleData add [Year] nvarchar(4); -- create table LabAnalyteData create table LabAnalyteData( labAnalyteID int identity , IDinAccess char(7) , AnalyteCode varchar(10) , Analyte varchar(100) , MethodCode varchar(5) , LessThanIndicator varchar(5) , LevelType varchar(5) , ReportingLevel decimal(16,8) , ReportingLevelUnit varchar(5) , ConcentrationLevel decimal(16,8) , ConcentrationLevelUnit varchar(5) , MonitoringPeriodBeginDate date , MonitoringPeriodEndDate date , CONSTRAINT PK_labAnalyteID PRIMARY KEY (labAnalyteID) , CONSTRAINT FK_IDinAccess FOREIGN KEY (IDinAccess) REFERENCES LabSampleData(IDinAccess) ) GO /****** Object: Table [dbo].[LADATA2005-2015a] Script Date: 3/6/2015 3:24:28 AM ******/ SET ANSI_NULLS ON GO SET QUOTED_IDENTIFIER ON GO -- imported table from MS Access CREATE TABLE [dbo].[LADATA2005-2015a]( [ID] [int] NOT NULL, [IDinAccess] char(7) NULL, [DoneYN] [bit] NOT NULL, [IDinExcel] char(7) NULL, [Year] [nvarchar](4) NULL, [PrincipalCountyServed] [nvarchar](30) NULL, [WaterSystemNo] [nvarchar](30) NULL, [WaterSystemName] [nvarchar](100) NULL, [LabSampleNo] [nvarchar](255) NULL, [SiteURL] [nvarchar](max) NULL, [SampleType] [nvarchar](10) NULL, [CollectionDate] [nvarchar](10) NULL, [SamplingPoint] [nvarchar](30) NULL, [excludeYN] [bit] NOT NULL, [ActionCategory] [nvarchar](10) NULL, [htnlDoneOn] [datetime] NULL, [UploadDoneOn] [datetime] NULL ) ON [PRIMARY] TEXTIMAGE_ON [PRIMARY] go -- create ErrorTable create table ErrorTable( ErrID int identity , IDinAccess char(7) , ErrorCategory varchar(25) , ErrorDescription varchar(500) , [DoneYN] [bit] default 0 , Time_Stamp datetime ); GO Create Proc spAddToErrorTable @IDinAccess char(7) , @ErrorCategory varchar(25) , @ErrorDescription varchar(500) , @Time_Stamp datetime As Begin Insert ErrorTable(IDinAccess, ErrorCategory, ErrorDescription, Time_Stamp) Values(@IDinAccess, @ErrorCategory, @ErrorDescription, @Time_Stamp); end Create TYPE [dbo].[TVP_AnalyteData] AS TABLE( [IDinAccess] char(7) , AnalyteCode varchar(10) , Analyte varchar(100) , MethodCode varchar(5) , LessThanIndicator varchar(5) , LevelType varchar(5) , ReportingLevel decimal(16,8) , ReportingLevelUnit varchar(5) , ConcentrationLevel decimal(16,8) , ConcentrationLevelUnit varchar(5) , MonitoringPeriodBeginDate date , MonitoringPeriodEndDate date ) GO -- create sp_InsertLabSampleData to enter both sample and analyte data Create PROCEDURE [dbo].[sp_InsertLabSampleData] @IDinAccess char(7) -- single sample-specific data , @WaterSystemNo varchar(15) , @WaterSystemName varchar(100) , @PrincipalParishServed varchar(15) , @Status varchar(5) , @LabSampleNo varchar(15) , @FederalType varchar(5) , @StateType varchar(5) , @PrimarySource varchar(15) , @ActivityDate date , @CollectionDate date , @message VARCHAR(100) OUTPUT AS BEGIN Declare @ExistIDinAccess int; Declare @msg varchar(500); Select @ExistIDinAccess = Count(*) From LabSampleData Where IDinAccess=@IDinAccess; if (@ExistIDinAccess = 0) Begin -- insert sample-specific data to SampleData table INSERT INTO dbo.LabSampleData ( IDinAccess , WaterSystemNo , WaterSystemName , PrincipalParishServed , [Status] , LabSampleNo , FederalType , StateType , PrimarySource , ActivityDate , CollectionDate) Values ( @IDinAccess , @WaterSystemNo , @WaterSystemName , @PrincipalParishServed , @Status , @LabSampleNo , @FederalType , @StateType , @PrimarySource , @ActivityDate , @CollectionDate ); End Else Set @msg = @IDinAccess + ' already exists; the second attempt was ignored.'; if @@Error>0 begin if @msg ='' Set @msg = 'SQL Server error: ' + CAST(@@Error AS NVARCHAR(10)) else Set @msg = @msg + '; additionally there was a SQL Server error: ' + CAST(@@Error AS NVARCHAR(10)) end Set @message = @msg; Return @@Error END -- create sp_InsertSampleAnalyteData to enter both sample and analyte data Create PROCEDURE [dbo].[sp_InsertLabAnalyteData] @AnalyteTVP TVP_AnalyteData READONLY AS BEGIN --Select * From @AnalyteTVP; -- multiple records of Anaylte data to AnalyteData table INSERT INTO dbo.LabAnalyteData ( IDinAccess , AnalyteCode , Analyte , MethodCode , LessThanIndicator , LevelType , ReportingLevel , ReportingLevelUnit , ConcentrationLevel , ConcentrationLevelUnit , MonitoringPeriodBeginDate , MonitoringPeriodEndDate) SELECT IDinAccess , AnalyteCode , Analyte , MethodCode , LessThanIndicator , LevelType , ReportingLevel , ReportingLevelUnit , ConcentrationLevel , ConcentrationLevelUnit , MonitoringPeriodBeginDate , MonitoringPeriodEndDate FROM @AnalyteTVP; Return @@Error END select * from LabSampleData Where IDinAccess >'0135000'; select * from LabAnalyteData Where Analyte like 'Mtb%' ; Select * From ErrorTable; select * from LabAnalyteData Where IDinAccess >'0048923'; --delete from LabAnalyteData ;; --delete from LabSampleData Where IDinAccess ='0048924'; --delete from ErrorTable; Select e.IDinAccess,a.LabSampleNo, a.[year], a.PrincipalCountyServed, a.WaterSystemNo from ErrorTable as e inner join [LADATA2005-2015a] as a on e.IDinAccess=a.IDinAccess select s.[year], a.AnalyteCode, a.Analyte , Min(a.reportingLevel) as min_Reporting_Level , Max(a.reportingLevel) as max_Reporting_Level , a.ReportingLevelUnit as Reporting_Unit from LabAnalyteData as a inner join LabSampleData as s on s.IDinAccess= a.IDinAccess Where a.Analyte like 'METHYL TERT%' Group By s.[year], a.AnalyteCode, a.Analyte, a.ReportingLevelUnit Order By a.Analyte select s.IDinAccess,s.LabSampleNo, d.SamplingPoint ,s.[year], a.AnalyteCode, a.Analyte , a.reportingLevel , ReportingLevelUnit , a.ConcentrationLevel , a.ConcentrationLevelUnit , d.SiteURL from LabAnalyteData as a inner join LabSampleData as s on s.IDinAccess= a.IDinAccess inner join [LADATA2005-2015a] as d on d.IDinAccess=s.IDinAccess Where a.AnalyteCode = '2251' and s.[year] IN('2006', '2007', '2008') Order By s.[year] select AnalyteCode, Analyte from LabAnalyteData Where IDinAccess = '0009517' --Analyte like 'methyl tertiaryl%' Order By AnalyteCode select distinct ReportingLevelUnit from LabAnalyteData Order By ReportingLevelUnit select distinct ConcentrationLevelUnit from LabAnalyteData Order By ConcentrationLevelUnit Select d.IDinAccess, d.[Year], s.IDinAccess, s.[Year] From [LADATA2005-2015a] d inner join LabSampleData s on d.IDinAccess = s.IDinAccess UPDATE s SET s.[Year] = d.[Year] FROM LabSampleData as s INNER JOIN [LADATA2005-2015a] d ON d.IDinAccess = s.IDinAccess Select PrincipalCountyServed ,d.SiteURL ,d.SampleType ,d.SamplingPoint ,d.ActionCategory ,s.IDinAccess ,s.excludeYN ,s.WaterSystemNo ,s.WaterSystemName ,s.PrincipalParishServed ,s.[Status] ,s.LabSampleNo ,s.FederalType ,s.StateType ,s.PrimarySource ,s.ActivityDate ,s.CollectionDate ,s.Note ,s.[Year] ,a.AnalyteCode ,a.Analyte ,a.MethodCode ,a.LessThanIndicator ,a.LevelType ,a.ReportingLevel ,a.ReportingLevelUnit ,a.ConcentrationLevel ,a.ConcentrationLevelUnit ,a.MonitoringPeriodBeginDate ,a.MonitoringPeriodEndDate from [LADATA2005-2015a] d Inner join LabSampleData s on d.IDinAccess =s.IDinAccess Inner join LabAnalyteData a on a.IDinAccess =s.IDinAccess Where s.[Year] < '2006' Order By s.IDinAccess, a.[labAnalyteID]C# code, XPath, and others (Data Scraping):
using System; using HtmlAgilityPack; namespace _SWAPE { class Program { static void Main(string[] args) { HtmlDocument hdoc = new HtmlDocument(); HtmlNode abcTable; hdoc.Load("test1.html"); abcTable = hdoc.DocumentNode.SelectSingleNode("//table[@summary='ABC']"); foreach (HtmlNode row in abcTable.SelectNodes("tr")) { HtmlNodeCollection rowCells = row.SelectNodes("td"); if (rowCells != null) { for (int i = 0; i < rowCells.Count; i += 2) { Response.Write(String.Format("{0} | {1} |", rowCells[i].InnerText, rowCells[i + 1].InnerText)) } } Console.WriteLine(); } Console.ReadKey(); } } } -------------------------- using System; using System.Collections.Generic; using System.ComponentModel; using System.Data; using System.Drawing; using System.IO; using System.Linq; using System.Net; using System.Text; using System.Threading.Tasks; using System.Windows.Forms; using Excel = Microsoft.Office.Interop.Excel; namespace SWAPE_201502 { public partial class Form1 : Form { public Form1() { InitializeComponent(); } private void button1_Click(object sender, EventArgs e) { string urlAddress = "https://sdw.oph.dhh.la.gov/DWW/JSP/NonTcrSampleResultsGlobal.jsp?detectionFlag=Y&sample_number=%201364347&collection_date=%2001-15-2015&tinwsys_is_number=28&tinwsys_st_code=LA&tsasampl_is_number=278912&tsasampl_st_code=LA"; HttpWebRequest request = (HttpWebRequest)WebRequest.Create(urlAddress); HttpWebResponse response = (HttpWebResponse)request.GetResponse(); Stream receiveStream = response.GetResponseStream(); StreamReader readStream = null; if (response.CharacterSet == null) { readStream = new StreamReader(receiveStream); } else { readStream = new StreamReader(receiveStream, Encoding.GetEncoding(response.CharacterSet)); } string data = readStream.ReadToEnd(); //string htmlCode = client.DownloadString(urlAddress); System.IO.File.WriteAllText(@"C:\Users\Mike\Documents\RobHesse\Feb2015\test.txt", data); response.Close(); readStream.Close(); } private void button2_Click(object sender, EventArgs e) { //http://csharp.net-informations.com/excel/csharp-create-excel.htm Excel.Application xlApp = new Microsoft.Office.Interop.Excel.Application(); if (xlApp == null) { MessageBox.Show("Excel is not properly installed!!"); return; } Excel.Workbook xlWorkBook; Excel.Worksheet xlWorkSheet; object misValue = System.Reflection.Missing.Value; xlWorkBook = xlApp.Workbooks.Add(misValue); xlWorkSheet = (Excel.Worksheet)xlWorkBook.Worksheets.get_Item(1); xlWorkSheet.Cells[1, 1] = "Sheet 1 content"; xlWorkBook.SaveAs("C:\\Users\\Mike\\Documents\\RobHesse\\Feb2015\\csharp-Excel.xls", Excel.XlFileFormat.xlWorkbookNormal, misValue, misValue, misValue, misValue, Excel.XlSaveAsAccessMode.xlExclusive, misValue, misValue, misValue, misValue, misValue); xlWorkBook.Close(true, misValue, misValue); xlApp.Quit(); releaseObject(xlWorkSheet); releaseObject(xlWorkBook); releaseObject(xlApp); MessageBox.Show("Excel file created , you can find the file C:\\Users\\Mike\\Documents\\RobHesse\\Feb2015\\csharp-Excel.xls"); } private void releaseObject(object obj) { try { System.Runtime.InteropServices.Marshal.ReleaseComObject(obj); obj = null; } catch (Exception ex) { obj = null; MessageBox.Show("Exception Occured while releasing object " + ex.ToString()); } finally { GC.Collect(); } } } }
Sample Websites:

Developed Performance Dashboard and KP ReConnect applications in ASP.net using C# and VB.net, respectively. These sites will be available for demo soon.
Kaiser Permanente Decision Support Application, Oracle backend:

-- Notes: -- Tags Words: Some comment fields are tagged to facilitate Slice & Dice system. -- In tDefinition, with category = 'SearchTag' there is a precompiled -- tag words (produced by Peggy). An stored procedure will be coded to -- be triggered (tbd later) to automatically tag these comment fields. -- The word tag code has been developed in VBA and will be converted to Oracle. -- -- Tag Word Store Procedure: -- Scans comment fields, and if a tag word is located, enters the matching ID value in -- -- tResponseSpecific.response_Id = tResponse.response_Id -- tResponseSpecific.definition_Id = tDefinition.definitionId -- tResponseSpecific.fieldName = "comment field name being searched" -- -- This procedure, in the meantime, is also expected to concatenate the tagged words -- and put it in tResponse.buildIssuesTag for example: -- tResponse.buildIssuesTag = the concatenated values returned by: Select Abbreviation From tDefinition -- inner join tResponseSpecific on definitionId = definition_Id Where -- tResponseSpecific.fieldName = 'buildIssuesTag' Order By sort; -- -- Note: This display of the tag words will be used to facilitate manual tagging or untagging later. -- create tPeople. -- users are expected to enter only tPeople.nuid, the rest of the information will be refreshed -- nightly or on demand using the existing tables in the Oracle database. CREATE TABLE tPeople ( peopleId number(6) not null, -- auto generated for internal use admin char(1) not null default 'N', submitter char(1) not null default 'N', operator char(1) not null default 'N', builder char(1) not null default 'N', nuid char(7) not null, firstName varchar2(50) null, lastName varchar2(50) null, mI varchar2(5) null, email varchar2(50) null, amailAlternate varchar2(50) null, address1 varchar2(50) null, address2 varchar2(50) null, City varchar2(50) null, State varchar2(25) null, zip varchar2(10) null, urlBusiness varchar2(100) null, urlPeronal varchar2(100) null, homephone varchar2(100) null, exclude char(7) not null, -- if Y retires, doesn't show int he combo boxes any more officephone char(12) null, cellPhone char(12) null, fax char(12) null, OldRequestId varchar(20) not null, -- ? not necessary after data conversion is compelted -- constraint ckAdmin check(admin in('Y', 'N')), constraint ckSubmitter check(submitter in('Y', 'N')), constraint ckOperator check(operator in('Y', 'N')), constraint ckBuilder check(builder in('Y', 'N')), constraint pk_tPeople primary key (peopleId), constraint ckNuid check(nuid like '[A-Z][0-9][0-9][0-9][0-9][0-9]') ) -- create tPosition -- one pesron may play multiptle roles in the same or more than one location -- individual people_Id/credential_Id/role_Id/group_Id/location_Id combination produces a unique -- positionId. These 5 fields need to be indexed to proven duplicate position for the same person.****************** -- the application admin to compose credential_Id/role_Id/group_Id/location_Id combination -- using dropboxes with rowsouce like "Select Abbreviation From tDefinition Where Category = 'Credential' -- Order By sort;" for example to make selection for the credential field.. CREATE TABLE tPosition ( positionId number(6) not null, -- auto generated for internal use credential_Id number(6) not null, -- matches tDefinition.definitionId role_Id number(6) not null, -- matches tDefinition.definitionId group_Id number(6) not null, -- matches tDefinition.definitionId location_Id number(6) not null, -- matches tDefinition.definitionId people_Id number(6) not null, -- matches tPepole.pepoleId comment varchar2(1000) null, -- constraint pk_tPosition primary key (positionId), constraint fk_tPosition_people_Id foreign key (people_Id) references tPeople(peopleId) ) -- create tRequest -- to avaoid dupplication request entry, we may be handle it by -- ndexing dsReqDescriptionTag. Also requestDate filed may have to be renamed -- to indicate when the record entry has been made. A single entery my be sponsored -- by different groups each with different request date,****************************** CREATE TABLE tRequest ( requestId number(6) not null, -- auto generated for internal use dsReqDescription varchar2(1000) not null, dsReqDescriptionTag varchar2(500) not null, -- see note titled "Tags Words" submitter_Id2 number(6) not null, -- ? sponsorMdLead_Id number(6) not null, -- matches tPosition.positionId submitter_Id number(6) not null, -- matches tPosition.positionId timeStamp date default SYSDATE, --stores date and time to the second lastEditedOn date, comment varchar2(1000) null, -- ? oldRequestId varchar2(25) null, -- ? not necessary afte data conversion is compelted tempReqId number(6) null, -- ? not necessary afte data conversion is compelted requestDate date not null, -- see notes for this table -- constraint pk_tRequest primary key (requestId), constraint fk_tRequest_submitter_Id foreign key (submitter_Id) references tPosition(positionId) ) -- create tPositionRequest -- Select definitionId, abbreviation from tDefinition Where Category = 'Hat' produces list of: -- Leader, co-leader, etc. to be pick by application admin thus storing -- tPositionRequest.hat_Id = tDefinition.definitionId along with -- tPositionRequest.position_Id = tPosition.positionId -- tPositionRequest.response_Id = tResponse.responseId -- these 3 fields (position_Id, response_Id, hat_Id) need to be indexed to proven -- duplicate position for the same person.******************************************** CREATE TABLE tPositionRequest ( positionRequestId number(6) not null, position_Id number(6) not null, -- matches tPosition.positionId response_Id number(6) not null, -- matches tResponse.responseId hat_Id number(6) not null, -- matches tDefinition.definitionId -- constraint pk_tPositionRequest primary key (positionRequestId), constraint fk_tPositionRequest_position_Id foreign key (position_Id) references tPosition(positionId), constraint fk_tPositionRequest_position_Id foreign key (response_Id) references tResponse(responseId) ) -- create tDefinition CREATE TABLE tDefinition ( definitionId number(6) not null, -- auto generated for internal use exclude char(1) not null default 'N', -- retire, don't show in the combo boxes any more sort varchar2(5) null default 'Z', abbreviation varchar2(25) not null, description varchar2(100) not null,, category varchar2(25) not null, subCategory varchar2(25) not null, comment varchar2(1000) null, -- ? could be used for Los_Angles for Los Angles to facilitate tag word search. lastEdited date null, -- ? userName varchar2(50) null, -- ? tempId number(6) null, -- ? -- constraint pk_tDefinition primary key (definitionId) ) -- create tResponse CREATE TABLE tResponse ( responseId number(6) not null, -- auto generated for internal use request_Id number(6) not null, -- matches tRequest.requestId oldRequestId varchar2(25) null, -- ? old_responseId varchar2(25) null, -- ? reviewDate date null, approvalDate date null, startDate date null, targetDate date null, specCallCompletedDate date null, concept_POC_Date date null, parentSrDate date null, buildCompletion_RESC_Date date null, buildCompletion_WITS_Date date null, buildCompletion_PSUP_Date date null, buildCompletion_PROD_Date date null, pilotDate date null, deploymentDate date null, status_Id number(6) not null, -- matches tDefinition.definitionId comment varchar2(1000) null, -- ? temp_sponsorshipGroup_10 varchar2(1000) null, -- ? requestorName_11 varchar2(1000) null, requestorTitle_12 varchar2(1000) null, dsReqPurpose_15 varchar2(1000) null, dsReqPurposeTag_15 varchar2(1000) null, currentPerformanceWoDs_16 varchar2(1000) null, currentPerformanceWoDsTag_16 varchar2(1000) null, dsPatientPopulation_17 varchar2(1000) null, dsPatientPopulationTag_17 varchar2(1000) null, dsmpactedUsr_19 varchar2(1000) null, dsmpactedUsrTag_19 varchar2(1000) null, dsImpactSpecialties_20 varchar2(1000) null, dsImpactSpecialtiesTag_20 varchar2(1000) null, dsRelatedToExistingDsTag_21 varchar2(1000) null, dsRelatedToExistingDs_21 varchar2(1000) null, dsMethodForSuccess_22 varchar2(1000) null, dsMethodForSuccessTag_22 varchar2(1000) null, dsPilot_23 varchar2(1000) null, dsPilotTag_23 varchar2(1000) null, dsDeployment_24 varchar2(1000) null, dsEducation_25 varchar2(1000) null, dsTraining_26 varchar2(1000) null, dsMonitor_27 varchar2(1000) null, dsMonitorTag_27 varchar2(1000) null, monitoringResultsResponseResponsiblility_28 varchar2(1000) null, submittedBy varchar2(50) null, cluster_Id number(6) null, -- matches tDefinition.definitionId altContactPersonTitle_30 varchar2(1000) null, -- ? dsRecommendationScore varchar2(1000) null, dsPanelRecommendation varchar2(1000) null, dsPanelComment varchar2(1000) null, dsIssuesForseen varchar2(1000) null, builder varchar2(1000) null, setting varchar2(1000) null, numericPriority char(1) null, buildDoc varchar2(1000) null, keyMessage varchar2(1000) null, talkingPoint varchar2(1000) null, specCallNeeded varchar2(1000) null, buildComment varchar2(1000) null, buildIssues varchar2(1000) null, buildIssuesTag varchar2(1000) null, releasedRegionallyComment varchar2(1000) null, releasedRegionallyCommentTag varchar2(1000) null, alignment varchar2(1000) null, proofOfConceptRequired char(1) not null default 'N', pasAlignmentRequired char(1) not null default 'N', pocRequired char(1) not null default 'N', temp_Requestor_Name varchar2(250) null, temp_Lead_MD varchar2(250) null, temp_Lead_MD_Title varchar2(250) null, temp_Submitter varchar2(250) null, temp_Submitter_Title varchar2(250) null, chkAgree char(1) not null default 'N', isDsRelatedToExistingDsYn char(1) not null default 'N', KnownInTheArea varchar2(1000) null, -- constraint pk_tResponse primary key (responseId), constraint fk_tResponse_request_Id foreign key (request_Id) references tRequest(requestId) ) -- create tResponseSpecific CREATE TABLE tResponseSpecific ( responseSpecificId number(6) not null, -- auto generated for internal use definition_Id number(6) not null, -- matches tDefinition.definitionId fieldName varchar2(250) null, -- comment varchar2(1000) null, --? response_Id number(6) not null, -- matches tResponse.responseId startPosition number(6) null, --? abbreviation varchar2(25) not null, --? description varchar2(100) not null, --? position_Id number(6) null, --? fieldValue varchar2(100) null, --? timeStamp date default sysdate, --? definition2_Id number(6) null, --? -- constraint pk_tResponseSpecific primary key (responseSpecificId), constraint fk_tResponseSpecific_response_Id foreign key (response_Id) references tResponse(responseId) )