[e]
[e]
[e]

Data Warehousing with MySql

Data Warehousing is the idea of setting up a separate database structure for purely reporting purposes, so you can stop compromising between reporting and transactional demands.
MySql is a database that is fast, easy to use, and (generally) free.

> Aggregation
> Benchmarks
> ClickstreamData
> DatesAndTimes
> Home
> Indexing
> KPIs
> Links
> MathML
> Money
> MysqlCompared
> PhpPlugins
> Recipe
> Regression
> Scripting
> Sudoku
> SummaryData
> TableTypes
> TuningMysql
New Page:

search:

>
edit css
show debugging
contact: tom (.) cunningham (at) gmail (dot) com
[e]
[e]

###### regression.sql
###### Tom Cunningham <tom-dot-cunningham-at-gmail-dot-com> Fri-13-May-05
###### Functions for running a bivariate regression & getting output values.
###### Expected input: a table or view called "regression" with
###### columns: "datum" (=unique id), "independent", & "dependent".

####### Simple test regression data
#  drop table if exists simple_data;
#create table simple_data (datum int auto_increment primary key, independent float, dependent float);
#insert into simple_data (independent, dependent)
#values (1,4.1), (2,5.0), (3,6.1), (4,7), (5,8.1), (6,9), (7,10.1), (8,11), (9,10);
#  DROP VIEW IF EXISTS regression;
#CREATE VIEW regression AS
#SELECT * FROM simple_data;


###### get means
select @DEPENDENT_MEAN:= AVG( dependent ),
      @DEPENDENT_STD:= STD( dependent ),
      @DEPENDENT_VAR:= POW(STD( dependent ),2),
      @DEPENDENT_MAX:= MAX(dependent),

      @INDEPENDENT_MEAN:= AVG( independent ),
      @INDEPENDENT_STD:= STD( independent ),
      @INDEPENDENT_VAR:= POW(STD( independent ),2),

      @INDEPENDENT_SUM_SQRD:= SUM( independent * independent ),
      @SAMPLE:= COUNT(*)
 from regression;

###### beta (slope)
select @BETA:= SUM( (dependent - @DEPENDENT_MEAN ) * (independent - @INDEPENDENT_MEAN) )
              / SUM( POW( independent - @INDEPENDENT_MEAN,2 ) )
 from regression;

###### alpha (intersection)
SELECT @ALPHA:= @DEPENDENT_MEAN - @BETA * @INDEPENDENT_MEAN;

###### predicted values & residuals
select independent, dependent,
      @ALPHA + @BETA * independent AS predicted,
      @ALPHA + @BETA * independent - dependent AS error,
      REPEAT(IF((@ALPHA + @BETA * independent - dependent)>0,'+','-'),
             ABS(@ALPHA + @BETA * independent - dependent)) AS residual
 from regression
ORDER BY independent ASC;

###### Check that sum of residuals is 0
select round( sum( @ALPHA + @BETA * independent - dependent ), 3) from regression;

###### R-squared (coefficient of determination -- proportion of variance explained) (p107)
    # Cov(X,Y)^2 / (Var(X)*Var(Y))
SELECT @R_SQUARED:= POW( SUM( (dependent - @DEPENDENT_MEAN) * (independent - @INDEPENDENT_MEAN) ), 2)
                   / ( SUM( POW( independent - @INDEPENDENT_MEAN, 2) ) * SUM( POW( dependent - @DEPENDENT_MEAN, 2) ) )
 FROM regression;

###### Variance of regression (p107)
SELECT @VAR_REGRESSION:= SUM( POW( @ALPHA + @BETA * independent - dependent, 2 ) ) / (@SAMPLE - 2)
 FROM regression;

###### SER: Standard Error of Regression (p107)
SELECT @SER:= SQRT( @VAR_REGRESSION );

###### Std. Error for Beta = sqrt( var-regression / var-x )
SELECT @BETA_STD_ERROR:= SQRT( @VAR_REGRESSION / @INDEPENDENT_VAR );

###### Durbin-Watson (p120)
    # Measures serial correlation: if less than 1.5, positive serial correlation; if more than 2.5, then negative.
    # TODO: the quotient is not including the *first* error - should use left join
SELECT @DURBIN_WATSON:=
        SUM(
             POW( (@ALPHA + @BETA * r2.independent - r2.dependent)
                  - (@ALPHA + @BETA * r1.independent - r1.dependent)
                  ,2
             )
        )
      / SUM(
             POW( @ALPHA + @BETA * r2.independent - r2.dependent
                 ,2
             )
        ) AS dw
 FROM regression r2
 LEFT JOIN regression r1 ON r1.datum=r2.datum - 1;

###### dump all vars:
SELECT @SAMPLE,
      @ALPHA, @ALPHA_STD_ERROR,
      @BETA, @BETA_STD_ERROR,
      @R_SQUARED, @VAR_REGRESSION, @SER,
      @DURBIN_WATSON;


Total Time: 0.06s
REQUEST:Array
(
    [link_body/body] => {incl:Regression}
    [PHPSESSID] => b9a289e2959005ab5c102380940272d1
)
SESSION:Array
(
    [body/body] => {incl:Regression}
)
MESSAGES:PLUGINS: