diff --git a/QID-1208-MVAdiscbh/MVAdiscbh.py b/QID-1208-MVAdiscbh/MVAdiscbh.py new file mode 100644 index 0000000..ff45f9a --- /dev/null +++ b/QID-1208-MVAdiscbh/MVAdiscbh.py @@ -0,0 +1,70 @@ +import pandas as pd +import numpy as np +from scipy.cluster import hierarchy +import random +import matplotlib.pyplot as plt + +data = pd.read_csv("bostonh.dat", sep = "\s+", header=None) + +# transform data + +xt = data +for i in [0, 2, 4, 5, 7, 8, 9, 13]: + xt.iloc[:, i] = np.log(data.iloc[:, i]) + +xt.iloc[:, 1] = data.iloc[:, 1]/10 +xt.iloc[:, 6] = (data.iloc[:, 6]**(2.5))/10000 +xt.iloc[:, 10] = np.exp(0.4 * data.iloc[:, 10])/1000 +xt.iloc[:, 11] = data.iloc[:, 11]/100 +xt.iloc[:, 12] = np.sqrt(data.iloc[:, 12]) +data = xt.drop(3, axis = 1) + +da = (data - np.mean(data))/np.std(data, ddof = 1) +d = np.zeros([len(da),len(da)]) + + +for i in range(0, len(da)): + for j in range(0, len(da)): + d[i, j] = np.linalg.norm(da.iloc[i, :] - da.iloc[j, :]) + +ddd = d[1:, :-1][:, 0] +for i in range(1, len(da)-1): + ddd = np.concatenate((ddd, d[1:, :-1][i:, i])) + +w = hierarchy.linkage(ddd, 'ward') +tree = hierarchy.cut_tree(w, n_clusters = 2) + +da["tree"] = tree + +t1 = da[da["tree"] == 0].iloc[:, :-1] +t2 = da[da["tree"] == 1].iloc[:, :-1] + +m1 = t1.mean() # mean of first cluster +m2 = t2.mean() # mean of second cluster +m = (m1 + m2)/2 +s = ((len(t1) - 1) * t1.cov() + (len(t2) - 1) * t2.cov())/(len(xt) - 2) +alpha = np.linalg.inv(s) @ (m1 - m2) + +mis1 = ((t1 - m) @ alpha < 0).sum() +mis2 = ((t2 - m) @ alpha > 0).sum() + +corr1 = ((t1 - m) @ alpha > 0).sum() +corr2 = ((t2 - m) @ alpha < 0).sum() + +aper = (mis1 + mis2)/len(xt) +alph = (da.iloc[:, :-1] - np.tile(np.array(m), (len(da), 1))) @ alpha + +random.seed(1) + +p = tree.flatten() + (0.05*np.random.normal(size = len(tree))) + +dav = pd.DataFrame(data = {"tree": tree.flatten(), "p": p, "alph": alph}) + +fig, ax = plt.subplots(figsize = (10, 10)) +ax.scatter(dav[dav["tree"] == 0]["alph"], dav[dav["tree"] == 0]["p"], c = "r") +ax.scatter(dav[dav["tree"] == 1]["alph"], dav[dav["tree"] == 1]["p"], c = "k", + marker = "^") +ax.axvline(0, c = "k") +plt.title("Discrimination scores", fontsize = 16) +plt.show() + diff --git a/QID-1208-MVAdiscbh/MVAdiscbh_python.png b/QID-1208-MVAdiscbh/MVAdiscbh_python.png new file mode 100644 index 0000000..2b26483 Binary files /dev/null and b/QID-1208-MVAdiscbh/MVAdiscbh_python.png differ diff --git a/QID-1208-MVAdiscbh/Metainfo.txt b/QID-1208-MVAdiscbh/Metainfo.txt index c7ac94c..4335a82 100644 --- a/QID-1208-MVAdiscbh/Metainfo.txt +++ b/QID-1208-MVAdiscbh/Metainfo.txt @@ -10,9 +10,11 @@ See also: MVAaer, MVAaper, MVAaerbh, MVAdisfbank, MVAdisnorm Author: Zografia Anastasiadou Author[SAS]: Svetlana Bykovskaya +Author[Python]: 'Matthias Fengler, Liudmila Gorkun-Voevoda' Submitted: Thu, August 04 2011 by Awdesch Melzer Submitted[SAS]: Wen, April 6 2016 by Svetlana Bykovskaya +Submitted[Python]: Tue, February 22 2022 by Liudmila Gorkun-Voevoda Datafile: bostonh.dat diff --git a/QID-1208-MVAdiscbh/README.md b/QID-1208-MVAdiscbh/README.md deleted file mode 100644 index fa39303..0000000 --- a/QID-1208-MVAdiscbh/README.md +++ /dev/null @@ -1,285 +0,0 @@ - -[Visit QuantNet](http://quantlet.de/) - -## [Visit QuantNet](http://quantlet.de/) **MVAdiscbh** [Visit QuantNet 2.0](http://quantlet.de/) - -```yaml - -Name of QuantLet : MVAdiscbh - -Published in : Applied Multivariate Statistical Analysis - -Description : 'Demonstrates maximum likelihood discrimination rule (ML rule) for the Boston housing -data.' - -Keywords : 'apparent-error-rate, discrimination, cluster-analysis, estimation, -discriminant-analysis, euclidean-distance-matrix, maximum-likelihood, plot, graphical -representation, financial, sas' - -See also : MVAaer, MVAaper, MVAaerbh, MVAdisfbank, MVAdisnorm - -Author : Zografia Anastasiadou - -Author[SAS] : Svetlana Bykovskaya - -Submitted : Thu, August 04 2011 by Awdesch Melzer - -Submitted[SAS] : Wen, April 6 2016 by Svetlana Bykovskaya - -Datafile : bostonh.dat - -Example : Discrimination scores for the two clusters created from the Boston housing data. - -``` - -![Picture1](MVAdiscbh.png) - -![Picture2](MVAdiscbh_sas.png) - - -### R Code: -```r - -# clear all variables -rm(list = ls(all = TRUE)) -graphics.off() - -# load data -data = read.table("bostonh.dat") - -# transform data -xt = data -xt[, 1] = log(data[, 1]) -xt[, 2] = data[, 2]/10 -xt[, 3] = log(data[, 3]) -xt[, 5] = log(data[, 5]) -xt[, 6] = log(data[, 6]) -xt[, 7] = (data[, 7]^(2.5))/10000 -xt[, 8] = log(data[, 8]) -xt[, 9] = log(data[, 9]) -xt[, 10] = log(data[, 10]) -xt[, 11] = exp(0.4 * data[, 11])/1000 -xt[, 12] = data[, 12]/100 -xt[, 13] = sqrt(data[, 13]) -xt[, 14] = log(data[, 14]) -data = xt[, -4] - -da = scale(data) # standardize variables -d = dist(da, "euclidean", p = 2) # euclidean distance matrix -w = hclust(d, method = "ward.D") # cluster analysis with ward algorithm -tree = cutree(w, 2) # define the clusters, tree=1 if cluster=1 - -# the following two lines under comments are for price of Boston houses -# tree=(xt[,14]>median(xt[,14]))+1 -# da=da[,1:12] - -t1 = subset(da, tree == 1) -t2 = subset(da, tree == 2) - -m1 = colMeans(t1) # mean of first cluster -m2 = colMeans(t2) # mean of second cluster -m = (m1 + m2)/2 # mean of both clusters -s = ((nrow(t1) - 1) * cov(t1) + (nrow(t2) - 1) * cov(t2))/(nrow(xt) - 2) # common variance matrix -alpha = solve(s) %*% (m1 - m2) # alpha for the discrimination rule - -# APER for clusters of Boston houses -mis1 = sum((t1 - m) %*% alpha < 0) # misclassified 1 -mis2 = sum((t2 - m) %*% alpha > 0) # misclassified 2 -corr1 = sum((t1 - m) %*% alpha > 0) # correct 1 -corr2 = sum((t2 - m) %*% alpha < 0) # correct 2 -aper = (mis1 + mis2)/nrow(xt) # APER (apparent error rate) -alph = (da - matrix(m, nrow(da), ncol(da), byrow = T)) %*% alpha -set.seed(1) - -# discrimination scores -p = cbind(alph, tree + 0.05 * rnorm(NROW(tree))) -tree[tree == 1] = 16 -tree[tree == 2] = 17 -tr = tree -tr[tr == 16] = "red" -tr[tr == 17] = "black" - -# plot of discrimination scores -plot(p[, 1], p[, 2], pch = tree, col = tr, xaxt = "n", yaxt = "n", xlab = "", ylab = "", - bty = "n") -abline(v = 0, lwd = 3) -title(paste("Discrimination scores")) - -``` - -### SAS Code: -```sas -* Import the data; -data bostonh; - infile '/folders/myfolders/data/bostonh.dat'; - input temp1-temp14; -run; - -proc iml; - * Read data into a matrix; - use bostonh; - read all var _ALL_ into datax; - close bostonh; - - xt = datax; - xt[, 1] = log(datax[, 1]); - xt[, 2] = datax[, 2]/10; - xt[, 3] = log(datax[, 3]); - xt[, 5] = log(datax[, 5]); - xt[, 6] = log(datax[, 6]); - xt[, 7] = (datax[, 7] ## (2.5))/10000; - xt[, 8] = log(datax[, 8]); - xt[, 9] = log(datax[, 9]); - xt[, 10] = log(datax[, 10]); - xt[, 11] = exp(0.4 * datax[, 11])/1000; - xt[, 12] = datax[, 12]/100; - xt[, 13] = sqrt(datax[, 13]); - xt[, 14] = log(datax[, 14]); - datax = xt[,1:3] || xt[,5:14]; - - create dat from datax[colname={"t1" "t2" "t3" "t4" "t5" "t6" "t7" "t8" "t9" "t10" "t11" "t12" "t13"}]; - append from datax; - close dat; - - create dat2 from xt[colname={"t1" "t2" "t3" "t4" "t5" "t6" "t7" "t8" "t9" "t10" "t11" "t12" "t13" "t14"}]; - append from xt; - close dat2; - -quit; - -* standardize the data matrix; -proc standard data = dat mean = 0 std = 1 out = ydat; - var t1-t13; -run; - -* euclidean distance matrix; -proc distance data = ydat out = dist method = euclid nostd; - var interval (t1--t13); -run; - -data newdist; - id + 1; - set dist; -run; - -* cluster analysis with ward algorithm; -ods graphics on; -proc cluster data = newdist(type = distance) - method = ward - plots(only) = (Pseudo Dendrogram(vertical)) - print = 0 - outtree = stat - noprint; - id id; - title 'Ward Dendrogram for standardized data'; -run; -ods graphics off; - -* define the clusters; -proc tree data = stat noprint out = sol nclusters = 2; - id id; -run; - -data t3; - set sol; - if CLUSTER = 1; -run; - -data t4; - set sol; - if CLUSTER = 2; -run; - -data ydat2; - id + 1; - set ydat; -run; - -proc iml; - * all data; - use ydat2; - read all var _ALL_ into main; - close ydat2; - - * cluster 1; - use t3; - read all var _ALL_ into r3; - close t3; - - * cluster 2; - use t4; - read all var _ALL_ into r4; - close t4; - - * original data; - use dat2; - read all var _ALL_ into z; - close dat2; - - idd1 = r3[,1]; - idd2 = r4[,1]; - - main = main[,2:14]; - d1 = main[idd1,]; - d2 = main[idd2,]; - - m1 = (d1[:,]); * mean of first cluster; - m2 = (d2[:,]); * mean of second cluster; - m = (m1 + m2) / 2; * mean of both clusters; - s = ((nrow(d1) - 1) * cov(d1) + (nrow(d2) - 1) * cov(d2))/(nrow(z) - 2); * common variance matrix; - alpha = inv(s) * (m1 - m2)`; * alpha for the discrimination rule; - - * APER for clusters of Boston houses; - mis1 = sum((d1 - m) * alpha < 0); * misclassified 1; - mis2 = sum((d2 - m) * alpha > 0); * misclassified 2; - corr1 = sum((d1 - m) * alpha > 0); * correct 1; - corr2 = sum((d2 - m) * alpha < 0); * correct 2; - aper = (mis1 + mis2)/nrow(z); * APER (apparent error rate); - alph = (main - repeat(m, nrow(main), 1)) * alpha; - - n = nrow(z); - z = (1:n)` || z; - - * marker for cluster 1; - do i = 1 to nrow(idd1); - z[idd1[i],1] = 1; - end; - - * marker for cluster 2; - do i = 1 to nrow(idd2); - z[idd2[i],1] = 2; - end; - - * discrimination scores; - t = 0.05 * randnormal(n, 0, 1); - do i = 1 to n; - t[i] = z[i,1] + t[i]; - end; - - p = z[,1] || alph || t; - - id = p[,1]; - x1 = p[,2]; - x2 = p[,3]; - - create plot var {"x1" "x2" "id"}; - append; - close plot; -quit; - -proc sgplot data = plot - noautolegend; - title 'Discrimination scores'; - scatter x = x1 y = x2 / colorresponse = id colormodel = (red blue) - markerattrs = (symbol = circlefilled); - refline 0 / axis = x lineattrs = (color = black thickness = 2); - xaxis display = none; - yaxis display = none; -run; - - - - - - -```