H.A.C.K.

Az első workshopon Orsi ezen a példakódon vezetett végig minket – kb. a feléig jutottunk. A fájl csatolva is van, plusz egy bónusz alapstatisztika-anyag is. Köszi, Orsi!

R pelda kod

# Bevezeto duma:
# R kicsoda
# mire jo:
#    - statisztikai tesztek (klasszikus es modern)
#    - matematikai feladatok megoldasa (pl. diffegyenlet numerikus megoldasa)
#    - tablazatos adatok belolvasasa, manipulalasa
#    - abrarajzolas

# Categorical independent variables (factors).
# Example: time of death as a function of whether the person smokes.
# (Simulated data, not real measurements.)
y <- c(65 + rnorm(20, sd = 0.5), 70 + rnorm(20, sd = 0.8))
x <- gl(2, 20, labels = c("Dohanyos", "Nem dohanyos"))

# Raw values in index order, coloured by group.
plot(
  y,
  col = ifelse(x == "Dohanyos", "red", "blue"),
  xlab = "Adatpontok indexe",
  ylab = "Halal idopontja"
)

# Formula interface with a factor on the right-hand side; notches requested.
plot(y ~ x, notch = TRUE, xlab = "Dohanyzas", ylab = "Halal idopontja")

# Bar chart of the two group means, labelled with the factor levels.
barplot(
  c(mean(y[x == "Dohanyos"]), mean(y[x == "Nem dohanyos"])),
  xlab = "Dohanyzas",
  ylab = "Varhato elettartam",
  names.arg = levels(x)
)

# At this point let's look at the basic elements of the language.
# (Without aiming for completeness.)
# Vectors (even a single number is a vector)
numVector <- 1
numVector <- seq_len(12)
# Build a 3-row matrix from the vector ...
numMatrix <- matrix(numVector, nrow = 3)
# ... or turn the vector itself into a 3x4 matrix by giving it dimensions.
dim(numVector) <- c(3, 4)

# Vector elements and array dimensions can be given names.
numVector <- seq_len(12)
names(numVector)
names(numVector) <- paste0(seq_len(12), ".")
dim(numVector) <- c(3, 4)
# Inspect names and dimnames again after the dimensions were assigned.
names(numVector)
dimnames(numVector)
# Label the rows ("sor") and the columns ("oszlop").
dimnames(numVector)[[1]] <- paste0(seq_len(nrow(numVector)), ". sor")
dimnames(numVector)[[2]] <- paste0(seq_len(ncol(numVector)), ". oszlop")

# Strings can form vectors too (the type is called "character").
strVector <- c("Elso", "Masodik", "Harmadik", "Negyedik")
dim(strVector) <- c(2, 2)

# Rows and columns of a matrix:
strVector
strVector[1, ]  # its first row
strVector[, 2]  # its second column

# Indexing a vector
numVector <- 2 * seq_len(12)
numVector
numVector[2]            # single element
numVector[2:5]          # contiguous range
numVector[c(2, 4, 8)]   # arbitrary positions

# Logical type
myBools <- c(TRUE, FALSE, TRUE, FALSE)
typeof(myBools)
# Indexing with logical values
numVector[myBools]  # Careful: the index is recycled if it is shorter than the vector!
numVector[numVector %% 3 == 0]  # !!
# Because the condition itself is a logical vector:
numVector %% 3 == 0

# Linear fit
x <- 0:100
# A linear function of x with uniform random error added
y <- -8 + 5 * x + 100 * runif(length(x))
plot(y ~ x)
# lm() does the linear fit; abline() draws the fitted line onto the plot.
abline(lm(y ~ x))

# Nonlinear fit when the structural model is known
x <- 0:100
# A nonlinear function of x with uniform random error added
y <- 11.8 * x * exp(-0.03 * x) + runif(length(x), min = -15, max = 15)
plot(y ~ x)
# The fit itself: estimate a and b from the given starting values.
model <- nls(y ~ a * x * exp(-b * x), start = list(a = 9, b = 0.05))
lines(x, predict(model, newdata = list(x = x)))

# Nonlinear fit when no structural model is known
# "Older, worse"
lines(lowess(x, y), col = "red")

# "Newer, better"
modelLoess <- loess(y ~ x)
lines(x, predict(modelLoess, newdata = data.frame(x = x)), col = "blue")

# deSolve supplies the ode() solver used below.
# Install it only when it is not already available, so that re-running the
# script does not download and reinstall the package every time.
if (!requireNamespace("deSolve", quietly = TRUE)) {
  install.packages("deSolve")
}
library("deSolve")

# deSolve::ode : solving differential equations
#
# Derivative function for a two-quantity system (A1, A2).
#   t      - current time (not used by the equations)
#   state  - named values A1 and A2
#   params - named rate constants k12, k21 and k20
# Returns a list whose single element is c(dA1, dA2).
diffEq <- function(t, state, params) {
  with(as.list(c(state, params)), {
    flow12 <- k12 * A1   # transfer A1 -> A2
    flow21 <- k21 * A2   # transfer A2 -> A1
    lossA2 <- k20 * A2   # removal from A2
    list(c(-flow12 + flow21, flow12 - flow21 - lossA2))
  })
}

# Initial amounts, rate constants and the time grid for the solver.
state <- c(A1 = 15, A2 = 0)
params <- list(k12 = 0.5, k21 = 0.03, k20 = 0.2)
times <- seq(from = 0, to = 10, by = 0.1)

odeOutput <- ode(y = state, times = times, func = diffEq, parms = params)

# Column 1 is plotted on the x axis ("Ido"); columns 2 and 3 are drawn as the
# black and the blue curve (presumably A1 and A2, in the order of `state`).
plot(
  odeOutput[, 1], odeOutput[, 2],
  type = "l", xlab = "Ido", ylab = "Anyagmennyiseg"
)
lines(odeOutput[, 1], odeOutput[, 3], col = "blue")

# Right, let's talk about the language again!
# Lists
# A list can hold any number of things of any type.
myList <- list(1, "Pityu", c(1, 2, 3, 4), matrix(c(1, 2, 3, 4), nrow = 2))
# Its elements can be named at creation ...
myOtherList <- list(
  el1 = 1,
  el2 = "Pityu",
  el3 = c(1, 2, 3, 4),
  el4 = matrix(c(1, 2, 3, 4), nrow = 2)
)
# ... or afterwards.
names(myList) <- paste0("e", seq_along(myList))
# Referring to elements: by name or by index
myList$e1
myList[1]
myList[[1]]
# The language is not strongly typed, so it is easy to get confused;
# check what type each expression has:
typeof(myList)
typeof(myList$e1)
typeof(myList[1])
typeof(myList[[1]])
typeof(myList[[2]])
typeof(myList[[3]])
typeof(myList[[4]])

# Only cover this if somebody asks:
# c() on two lists concatenates them; names are kept as they are.
l1 <- list(a1 = 3, a2 = 8)
l2 <- list(a2 = 4)
listVector <- c(l1, l2)

# Functions
#
# myFun: returns par1 + par2^2.
#   par1, par2 - numeric values; both are required (neither has a default).
myFun <- function(par1, par2) {
  par1 + par2^2
}
# Calling it without par2 is an error ("argument is missing, with no default").
# The call is wrapped in try() so that the error is still shown but the rest
# of the script keeps running when the file is sourced.
try(myFun(4))
myFun(4, 3)
# Named arguments are matched first; the remaining 4 becomes par1.
myFun(par2 = 2, 4)

# myFunWithDefault: same formula as myFun (par1 + par2^2), but par2 falls
# back to 2 when the caller leaves it out.
myFunWithDefault <- function(par1, par2 = 2) {
  squaredTerm <- par2^2
  par1 + squaredTerm
}
myFunWithDefault(4)     # uses the default par2
myFunWithDefault(4, 3)  # overrides it

# myFunWithDots: returns par1 plus the number of extra arguments passed via
# `...` (each argument counts once, whatever its length).
myFunWithDots <- function(par1, ...) {
  extraArgs <- list(...)
  nExtra <- length(extraArgs)
  par1 + nExtra
}
myFunWithDots(3)
myFunWithDots(3, 2, 1)
myFunWithDots(3, c(2, 1))
# example of a function taking `...`: paste

# If somebody asks: 'with'
paramList <- list(par1 = 3, par2 = 4)

# Version 1: spell out every element access. Returns par1 + par2 of the list.
myFunWithoutWith <- function(parList) {
  parList$par1 + parList$par2
}
myFunWithoutWith(paramList)

# Version 2: drop the cumbersome references -- this does NOT work, because
# par1 and par2 are not variables inside the function.
myFunWithoutWith <- function(parList) {
  par1 + par2
}
# Expected to fail with "object 'par1' not found" (unless variables of that
# name happen to exist in the workspace). Wrapped in try() so the error is
# shown without aborting the script when it is sourced.
try(myFunWithoutWith(paramList))

# Version 3: 'with' evaluates the expression inside the list, which removes
# the cumbersome references.
myFunWithWith <- function(parList) {
  with(parList, par1 + par2)
}
myFunWithWith(paramList)

# Branching and loops
# if, if-else, ifelse
# for, while, repeat (break)

x <- 1:10
# Element-by-element version: label each number as even ("paros") or odd
# ("paratlan").
for (i in seq_along(x)) {
  if (x[i] %% 2 == 0) {
    print(paste(x[i], "paros"))
  } else {
    print(paste(x[i], "paratlan"))
  }
}

# Vectorised version of the same thing. The condition must be
# `x %% 2 == 0`: a bare `x %% 2` is 1 (i.e. TRUE) for ODD numbers, which
# would swap the two labels.
parity <- ifelse(x %% 2 == 0, "paros", "paratlan")
print(parity)
print(paste(x, parity))

# data.frame (which is a list)
# In real life you would read it in with read.csv or read.table.
ids <- sample(seq_len(40))
y <- c(65 + rnorm(20, sd = 0.5), 70 + rnorm(20, sd = 0.8))
x <- gl(2, 20, labels = c("Dohanyos", "Nem dohanyos"))
doh <- data.frame(ID = ids, Smoke = x, Death = y)

# Sorting: by ID, then by group and ID within group.
doh[order(doh$ID), ]
doh[order(doh$Smoke, doh$ID), ]

# Row selection: subset() and logical indexing give the same rows.
subset(doh, Smoke == "Dohanyos")
doh[doh$Smoke == "Dohanyos", ]
# Row and column selection at once.
subset(doh, Smoke == "Dohanyos", select = c("ID", "Death"))

# Add a randomly shuffled, balanced Gender column.
Gender <- sample(gl(2, 20, labels = c("Ferfi", "No")))
doh1 <- cbind(doh, Gender = Gender)

library("lattice")

# Box-and-whisker plot of Death by Smoke, one panel per Gender.
# bwplot() only builds a plot object; it is drawn when that object is printed.
# At the interactive prompt printing happens automatically, but not when the
# file is run through source() or the call sits inside a function -- so print
# it explicitly.
print(with(doh1, bwplot(Death ~ Smoke | Gender)))

# Useful things (to finish with)
?paste   # manual page
??par    # help search
# Built-in demos and the examples section of a help page.
demo(graphics)
demo(persp)
example(lm)
# NOTE(review): grep() returns every search-path position whose name contains
# "stat"; this assumes exactly one entry matches -- confirm.
objects(grep("stat", search())) # e.g. every function of the stat package: handy
# when you know there must be something that does what you have in mind, but
# you do not know what it is called
# NOTE(review): this deletes every object in the workspace, including the
# user's own; keep it as the very last step.
rm(list=ls()) # Cleans out the cluttered workspace

Második alkalom - adatbányászat

R pelda kod

# Adatbanyaszat bevezetes (nagyon-nagyon alapveto)
# Kerdesfelvetes-tipusok:
# 1. Szivroham utan korhazban levo emberek eseten probaljuk megjosolni, hogy
#    lesz-e ujabb szivrohamuk a kovetkezo 2 evben, demografiai, klinikai es
#    eletmodbeli adatok alapjan.
# 2. Gazdasagi es egyeb mutatok alapjan probaljuk megjosolni egy bizonyos
#    tozsdeindex erteket adott ido mulva.
# 3. Bejovo email tulajdonsagai alapjan josoljuk meg, hogy spam-e.
# 4. Keressunk olyan mintazatokat emberek vasarlasi szokasaiban, amelyek
#    potencialisan kihasznalhatok reklam stb. celjaira.

# Tanulohalmaz (training set) -> szabalyszeruseg -> tesztadatokon megerosites
# -> ha jo, alkalmazas eles adatokra.

# Ha tudom, hogy a tanulohalmazomon mik a bemeneti adatok es mi a vart
# eredmeny: supervised learning problem.
# Ha magam sem tudom, hogy mit akarok, csak valami mintazatot talalni:
# unsupervised learning (4., esetleg 3. kerdes fent). Ezzel nem foglalkozunk.

# Supervised learning 2 alapveto tipusa: classification es regression.
# Ha a kimeno valtozo kategorikus, akkor osztalyozas, ha szam, akkor
# regresszio. (1. (3.), ill. 2. kerdes fent.)

# Supervised learning problemara 2 alapveto modszer: linearis modell es
# nearest neighbors.

# Linearis modell: alapveto feltetelezes, hogy a kimeno valtozo a bemeno
# valtozok valamely linearis kombinacioja, plusz egy intercept, es erre egy
# normaleloszlasu, 0 varhato erteku veletlen hiba jon meg ra.
# A bemeno valtozoimat persze transzformalhatom, ha ugy valoszinubb, hogy igaz
# ez a feltetelezes.
# Legkisebb negyzetek modszerevel keresi azokat az egyutthatokat, amelyek
# a vegso modellt irjak majd le. (Meg magyarazat!)

# Egy pelda, amire jo lesz a linearis modell:
# (Piros es kek pontok, 2 centrum korul csoportosulnak)

# 2 bemeno valtozo, kategorikus kimeno valtozo:

# Two clusters of points (red and blue), each scattered around one random
# centre drawn uniformly from [1, 100] in both coordinates.
redXCentr <- runif(1, min = 1, max = 100)
redYCentr <- runif(1, min = 1, max = 100)
blueXCentr <- runif(1, min = 1, max = 100)
blueYCentr <- runif(1, min = 1, max = 100)

# 50 points per colour: wide normal scatter around the centre plus a little
# extra noise.
redXPts <- rnorm(50, mean = redXCentr, sd = 30) + rnorm(50, mean = 0, sd = 2)
redYPts <- rnorm(50, mean = redYCentr, sd = 30) + rnorm(50, mean = 0, sd = 2)
blueXPts <- rnorm(50, mean = blueXCentr, sd = 30) + rnorm(50, mean = 0, sd = 2)
blueYPts <- rnorm(50, mean = blueYCentr, sd = 30) + rnorm(50, mean = 0, sd = 2)

# Bounding box of all points; used for the axis limits.
minX <- min(c(redXPts, blueXPts))
minY <- min(c(redYPts, blueYPts))
maxX <- max(c(redXPts, blueXPts))
maxY <- max(c(redYPts, blueYPts))

plot(redXPts, redYPts, xlim = c(minX, maxX), ylim = c(minY, maxY), col = "red",
     xlab = "Input1", ylab = "Input2")
points(blueXPts, blueYPts, col = "blue")

# Two input variables and a 0/1 coded outcome (0 = red, 1 = blue).
inputX <- c(redXPts, blueXPts)
inputY <- c(redYPts, blueYPts)
result <- c(rep(0, length(redXPts)), rep(1, length(blueXPts)))

linModel <- lm(result ~ inputX + inputY)
linModel
intr <- as.numeric(linModel$coefficients[1])
coeffX <- as.numeric(linModel$coefficients[2])
coeffY <- as.numeric(linModel$coefficients[3])
# The decision boundary is where the fitted value equals 0.5:
#   intr + x*coeffX + y*coeffY == 0.5  =>  y = (0.5 - intr - x*coeffX) / coeffY

# Evaluate the boundary over the whole plotted x range. With sd = 30 the data
# routinely extends beyond 0..100, so a fixed 0:100 grid would leave part of
# the plot without a boundary line.
testX <- seq(minX, maxX, length.out = 101)
testY <- (0.5 - intr - testX * coeffX) / coeffY

lines(testX, testY)

hist(resid(linModel))

# Vegyünk egy masik peldat, ahol nem lesz igaz a feltetelezes:
# (Megint piros es kek pontok, de szigetes jelleggel)

# Four random centres per colour, each coordinate uniform on [1, 100].
redXCentrs <- runif(4, min = 1, max = 100)
redYCentrs <- runif(4, min = 1, max = 100)
blueXCentrs <- runif(4, min = 1, max = 100)
blueYCentrs <- runif(4, min = 1, max = 100)


# 48 points per colour. The 4 centres are recycled along the 48 draws, and the
# x and y centres are recycled in the same order, so each point is placed
# around one of the four (x, y) centres. Tight scatter gives the "islands".
redXPts <- rnorm(48, mean = redXCentrs, sd = 5) + rnorm(48, mean = 0, sd = 2)
redYPts <- rnorm(48, mean = redYCentrs, sd = 5) + rnorm(48, mean = 0, sd = 2)
blueXPts <- rnorm(48, mean = blueXCentrs, sd = 5) + rnorm(48, mean = 0, sd = 2)
blueYPts <- rnorm(48, mean = blueYCentrs, sd = 5) + rnorm(48, mean = 0, sd = 2)

# Bounding box of all points; used for the axis limits.
minX <- min(c(redXPts, blueXPts))
minY <- min(c(redYPts, blueYPts))
maxX <- max(c(redXPts, blueXPts))
maxY <- max(c(redYPts, blueYPts))

plot(redXPts, redYPts, xlim = c(minX, maxX), ylim = c(minY, maxY), col = "red",
     xlab = "Input1", ylab = "Input2")
points(blueXPts, blueYPts, col = "blue")

# Can the linear model be run on this data? Of course!
inputX <- c(redXPts, blueXPts)
inputY <- c(redYPts, blueYPts)
result <- c(rep(0, length(redXPts)), rep(1, length(blueXPts)))

linModel <- lm(result ~ inputX + inputY)
linModel
intr <- as.numeric(linModel$coefficients[1])
coeffX <- as.numeric(linModel$coefficients[2])
coeffY <- as.numeric(linModel$coefficients[3])
# The decision boundary is where the fitted value equals 0.5:
#   intr + x*coeffX + y*coeffY == 0.5  =>  y = (0.5 - intr - x*coeffX) / coeffY

# Evaluate the boundary over the whole plotted x range rather than a fixed
# 0:100 grid, so the line always spans the plot.
testX <- seq(minX, maxX, length.out = 101)
testY <- (0.5 - intr - testX * coeffX) / coeffY

lines(testX, testY)

hist(resid(linModel)) # Whoops!

# This is what nearest neighbors is for -- which is not implemented here.

# Try local regression fits (loess) instead:
loessModel <- loess(result ~ inputX + inputY)
# See what it predicts on 100 test points drawn uniformly from the bounding
# box of the data:
testX <- runif(100, minX, maxX)
testY <- runif(100, minY, maxY)
predLoess <- predict(loessModel, data.frame(inputX = testX, inputY = testY))

# Training points as filled circles ...
plot(redXPts, redYPts, xlim = c(minX, maxX), ylim = c(minY, maxY), col = "red",
     xlab = "Input1", ylab = "Input2", pch = 19)
points(blueXPts, blueYPts, col = "blue", pch = 19)

# ... and test points as open circles coloured by predicted class
# (prediction below 0.5 = red, otherwise blue).
# Logical masks are used instead of negative indices: with which() and
# testX[-redTest], an empty redTest would select NOTHING (so no blue point
# would be drawn), and points without a prediction (NA) would be shown as blue.
hasPred <- !is.na(predLoess)
isRed <- hasPred & predLoess < 0.5
isBlue <- hasPred & !isRed
redTest <- which(isRed)  # kept: indices of the test points predicted red
points(testX[isRed], testY[isRed], col = "red")
points(testX[isBlue], testY[isBlue], col = "blue")

# Mirol erdemes meg erintolegesen beszelni?
# - Hogy valasszuk ki a fuggetlen valtozokat?
# - Hogy valasszuk ki a modellt?
# -> ha ezekkel "jatszom", akkor kulonosen fontos, hogy teszt adatokon is 
#    verifikaljam a modellem ervenyesseget.
# - Korrelalt fuggetlen valtozok: fokomponens-analizis, faktoranalizis
# - Fuggetlen valtozok nyilvan lehetnek transzformaltak
# - Interakciok

ID	Name	Comment	Uploaded	Size	Downloads
60	DataMiningIntro.r		stef Tue 16 of Nov, 2010 00:23 CET	5.44 Kb	632
54	RBasics.r	Bonusz alap statisztika	stef Wed 13 of Oct, 2010 19:15 CEST	20.11 Kb	670
53	pelda.R	Orsi anyaga	stef Wed 13 of Oct, 2010 19:15 CEST	6.03 Kb	626

Második alkalom - adatbányászat

Attached files

Log In

Upcoming Events