Commit c913fa1d authored by Bo's avatar Bo
Browse files

Creates six data processing functions:

filterSequence: gets the requested arguments of the users for a specific
course.

removeReduplicatePart: reorganises the list so that no idPart is duplicated
within the same idSession, and keeps the original order.
    
removeReduplicate: outputs the list with the duplicates removed.

getCorrectPSDsequence: takes the sequence produced by the Premium
Sequence function and outputs the sequence without duplicate idParts and
with the correct durations.

averagePartLengthForACourse:
gets the average session length for a course.

averageDurationForPart:
gets the average duration of a specific part in a specific course.
parent bf498454
......@@ -14,55 +14,60 @@ import pickle
from asyncore import read
from lib2to3.pgen2.tokenize import Special
from pattern.metrics import duration
from symbol import argument
from operator import itemgetter
# Module-level manager instances, created once when this module is imported.
databaseManager = DatabaseManager()
scriptDatabaseManager = ScriptDatabaseManager()
class DataProcessing:
def averageDuration(self, idPart, sequence):
    '''
    Get the average duration recorded for one part.

    idPart: identifier of the part whose durations are averaged.
    sequence: course sequence, passed unchanged to self.splitSequence with
        the 'idPart+duration' argument; the result is iterated as a list of
        per-user collections of (idPart, duration) pairs.
    return: the mean duration over every matching pair as a float, or 0.0
        when the part never occurs (avoids dividing by zero).

    NOTE(review): this view also shows a filterSequence method (with an
    extra `kind` parameter) next to the splitSequence header - confirm that
    splitSequence still exists before relying on this method.
    '''
    perUser = self.splitSequence(sequence, 'idPart+duration', False)
    matches = [entry[1]
               for userEntries in perUser
               for entry in userEntries
               if entry[0] == idPart]
    if not matches:
        return 0.0
    return float(sum(matches)) / float(len(matches))
def splitSequence(self, sequence, argument, needToSave):
def filterSequence(self, sequence, argument, kind, needToSave):
'''
get the different arguments of the users for a specific course
return: a list [idPart, idPart, ... ,idPart], [idSession, idSession, ... ,idSession]
[duration, duration, ... ,duration] or [((idPart, duration),(idPart, duration)...)...((idPart, duration),(idPart, duration)...)]
return: a list [(idPart, idPart, ... ,idPart),(idPart, idPart, ... ,idPart),...,(idPart, idPart, ... ,idPart)]
[(idSession, idSession, ... ,idSession),(idSession, idSession, ... ,idSession),...,(idSession, idSession, ... ,idSession)]
[(duration, duration, ... ,duration),(duration, duration, ... ,duration),(),...,(duration, duration, ... ,duration)]
[((idPart, duration),(idPart, duration)...),((),()...())...((idPart, duration),(idPart, duration)...)]
[(idUser,((idPart,duration),(idPart,duration)...)),...,(idUser,((idPart,duration),(idPart,duration)...))]
'''
list = []
list.append(sequence)
list.append(sequence)
(idCourse, firstSequence) = list[0]
userid = []; partid = []; sessionid = []; duration = [];
secondSequence = [] ; thirdSequence = []; fourthSequence = []; finalSequence = []
userid = []; partid = []; sessionid = []; duration = []; pid = []; sid = []; did = [];
secondSequence = [] ; thirdSequence = []; fourthSequence = [];
fifthSequence = []; sixthSequence = []; seventhSequence = []; eighthSequence = []; finalSequence = [];
for i in range(0,len(firstSequence)):
userid.append(firstSequence[i][0])
for j in range(0,len(firstSequence[i][1])):
partid.append(firstSequence[i][1][j][0])
sessionid.append(firstSequence[i][1][j][1])
duration.append(firstSequence[i][1][j][2])
pid.append(firstSequence[i][1][j][0])
sid.append(firstSequence[i][1][j][1])
did.append(firstSequence[i][1][j][2])
a = tuple((firstSequence[i][1][j][0],firstSequence[i][1][j][2]))
secondSequence.append(a)
r = tuple((firstSequence[i][1][j][0],firstSequence[i][1][j][1],firstSequence[i][1][j][2]))
fifthSequence.append(r)
u = tuple((firstSequence[i][1][j][0],firstSequence[i][1][j][1]))
seventhSequence.append(u)
if i != 0:
del pid[0:len(firstSequence[i-1][1])]
del sid[0:len(firstSequence[i-1][1])]
del did[0:len(firstSequence[i-1][1])]
del secondSequence[0:len(firstSequence[i-1][1])]
del fifthSequence[0:len(firstSequence[i-1][1])]
del seventhSequence[0:len(firstSequence[i-1][1])]
b = tuple(secondSequence)
s = tuple(fifthSequence)
v = tuple(seventhSequence)
x = tuple(pid)
y = tuple(sid)
z = tuple(did)
partid.append(x)
sessionid.append(y)
duration.append(z)
if argument == 'idUser+idPart+duration':
thirdSequence.insert(2*i, firstSequence[i][0])
thirdSequence.append(b)
......@@ -72,6 +77,8 @@ class DataProcessing:
fourthSequence.append(c)
else:
thirdSequence.append(b)
sixthSequence.append(s)
eighthSequence.append(v)
if argument == 'idUser':
finalSequence = userid
......@@ -83,29 +90,118 @@ class DataProcessing:
finalSequence = duration
elif argument == 'idPart+duration':
finalSequence = thirdSequence
elif argument == 'idPart+idSession':
finalSequence = eighthSequence
elif argument == 'idPart+idSession+duration':
return sixthSequence
elif argument == 'idUser+idPart+duration':
finalSequence = fourthSequence
else:
print ("Wrong input type")
if needToSave:
fileName = "filterSequences"+str(idCourse)+".p"
fileName = "filter"+str(kind)+"Sequences"+str(idCourse)+".p"
pickle.dump(finalSequence, open(os.path.join('filtersequences', fileName), "wb"))
return finalSequence
def averageLengthForPart(self, idCourse):
def removeReduplicatePart(self, sequence):
    '''
    Drop repeated (idPart, idSession) pairs and group the rest by session.

    sequence: iterable of hashable pairs shaped like
        ((idPart, idSession), (idPart, idSession), ..., (idPart, idSession)).
    return: a list of the distinct pairs sorted by idSession (ascending);
        pairs that share an idSession keep the order of their first
        appearance in `sequence`.
    '''
    seen = set()
    unique = []
    for pair in sequence:
        # Keep only the first occurrence so the input order is preserved
        # (going through a bare set() would make the order arbitrary).
        if pair not in seen:
            seen.add(pair)
            unique.append(pair)
    # sorted() is stable: entries with equal idSession stay in input order.
    return sorted(unique, key=itemgetter(1))
def removeReduplicate(self, sequence):
    '''
    Remove duplicated values inside every inner sequence.

    sequence: a list of inner sequences of hashable, mutually comparable
        values, e.g.
        [(idPart, idPart, ...), (idPart, idPart, ...), ...]
        [(idSession, idSession, ...), ...]
        [(duration, duration, ...), ...]
    return: a list with one tuple per inner sequence, holding its distinct
        values in ascending order (the original order is not kept).
    '''
    return [tuple(sorted(set(inner))) for inner in sequence]
idpart = self.splitSequence(a, 'idPart', False)
iduser = self.splitSequence(a, 'idUser', False)
def getCorrectPSDsequence(self, sequence, kind, courseName, needToSave):
    '''
    Merge repeated (idPart, idSession) entries of every user and sum their
    durations.

    sequence: the sequence produced by the Premium Sequence function (per
        the original docstring); it is passed unchanged to
        self.filterSequence.
    kind: Succeed or Failed; forwarded to filterSequence and used in the
        pickle file name.
    courseName: course label such as JAVA or XML; used in the pickle file
        name only.
    needToSave: when true, the result is pickled to
        finalFiltersequences/finalFilter<courseName><kind>Sequences.p
        (presumably the directory must already exist - TODO confirm).
    return: a list with one tuple per user, shaped like
        ((idPart, idSession, duration), ..., (idPart, idSession, duration)),
        where each (idPart, idSession) pair appears once, in the order given
        by self.removeReduplicatePart, and duration is the total over all
        of that pair's occurrences.
    '''
    durations = self.filterSequence(sequence, 'duration', kind, False)
    pairs = self.filterSequence(sequence, 'idPart+idSession', kind, False)
    finalSequence = []
    for userPairs, userDurations in zip(pairs, durations):
        # One pass accumulates the total duration of every pair instead of
        # rescanning the whole user sequence once per distinct pair.
        totals = {}
        for pair, spent in zip(userPairs, userDurations):
            totals[pair] = totals.get(pair, 0) + spent
        finalSequence.append(tuple(
            (pair[0], pair[1], totals[pair])
            for pair in self.removeReduplicatePart(userPairs)))
    if needToSave:
        fileName = "finalFilter"+courseName+kind+"Sequences"+".p"
        # The with-block closes the file even if pickling fails.
        with open(os.path.join('finalFiltersequences', fileName), "wb") as handle:
            pickle.dump(finalSequence, handle)
    return finalSequence
def averagePartLengthForACourse(self, sequence, kind, courseName):
    '''
    Get the average number of distinct (idPart, idSession) entries per user
    for a course.

    sequence: the sequence produced by the Premium Sequence function (per
        the original docstring).
    kind: Succeed or Failed.
    courseName: course label such as JAVA or XML.
    return: total number of entries returned by getCorrectPSDsequence
        divided by the number of users, as a float; 0.0 when there are no
        users (avoids dividing by zero).

    Side effect: prints the total entry count to stdout.
    '''
    perUser = self.getCorrectPSDsequence(sequence, kind, courseName, False)
    idUser = self.filterSequence(sequence, 'idUser', kind, False)
    total = sum(len(entries) for entries in perUser)
    # Function-call form prints the same text on Python 2 and Python 3.
    print(total)
    if not idUser:
        return 0.0
    return float(total) / float(len(idUser))
def averageDurationForPart(self, sequence):
    '''
    Get the average duration of each part of a course.

    sequence: the sequence produced by the Premium Sequence function (per
        the original docstring).
    return: a list of (idPart, average) tuples, where average is the total
        duration recorded for that part across all users divided by the
        number of users; an empty list when there are no users.

    NOTE(review): only the parts found in the FIRST user's sequence are
    reported (Mon_idPart[0]); parts seen solely by other users are left
    out. Kept as in the original - confirm whether that is intended.
    '''
    idPart = self.filterSequence(sequence, 'idPart', '_', False)
    Mon_idPart = self.removeReduplicate(idPart)
    psd = self.getCorrectPSDsequence(sequence, '_', '_', False)
    if not Mon_idPart:
        # No users at all: nothing to average (previously an IndexError).
        return []
    # Accumulate the total duration per part in one pass over all users.
    totals = {}
    for userEntries in psd:
        for entry in userEntries:
            totals[entry[0]] = totals.get(entry[0], 0) + entry[2]
    userCount = float(len(idPart))
    return [(part, float(totals.get(part, 0)) / userCount)
            for part in Mon_idPart[0]]
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment