Commit 0bc1ff06 authored by Bo's avatar Bo
Browse files

Create exportSequenceToTXTFile to adapt sequences for the frequent pattern mining

software
parent cadef72a
......@@ -7,20 +7,106 @@ Created on 2016年6月3日
from dataManager.scriptDatabaseManager2 import ScriptDatabaseManager
from dataManager.databaseManagerData2 import DatabaseManager
from dataManager.scriptDatabaseStatistic import ScriptDatabaseStatistic
from unidecode import unidecode
import math
import csv
import os
import pickle
import numpy as np
from asyncore import read
from lib2to3.pgen2.tokenize import Special
from pattern.metrics import duration
from symbol import argument
from operator import itemgetter
from math import sqrt
# Module-level manager instances, constructed once when this module is imported.
# NOTE(review): neither object is referenced in the visible part of this file;
# presumably they are used by methods outside this excerpt -- confirm before removing.
databaseManager = DatabaseManager()
scriptDatabaseManager = ScriptDatabaseManager()
class DataProcessing:
'''
*******************************************************************************************
Processing function
*******************************************************************************************
'''
def removeReduplicatePart(self, sequence):
    '''
    Drop repeated items from a sequence while keeping first-occurrence order.

    input: an iterable of hashable items, typically
        ((idPart,idSession),(idPart,idSession),...,(idPart,idSession))
    output: a new list holding every distinct item once, in the order in
        which it first appears in the input
    '''
    seen = set()
    unique = []
    for item in sequence:
        # One pass with a membership set is linear; sorting the set with
        # key=sequence.index rescans the input once per distinct item.
        if item not in seen:
            seen.add(item)
            unique.append(item)
    return unique
def removeReduplication(self, sequence):
    '''
    Deduplicate and sort every inner sequence.

    input: a list of sequences of one kind of value, e.g.
        [(idPart, idPart, ... ,idPart),...,(idPart, idPart, ... ,idPart)]
        [(idSession, idSession, ... ,idSession),...,(idSession, idSession, ... ,idSession)]
        [(duration, duration, ... ,duration),...,(duration, duration, ... ,duration)]
    output: a list with one tuple per inner sequence, holding its distinct
        values in ascending order
    '''
    return [tuple(sorted(set(values))) for values in sequence]
def RemoveSession(self, sequence):
    '''
    Reduce a (idPart, idSession) sequence to its idPart values.

    Repeated (idPart, idSession) pairs are first removed with
    removeReduplicatePart, then the idSession component is dropped so the
    result can be fed to the Frequent Pattern Mining operation.

    input: ((idPart,idSession),(idPart,idSession),...,(idPart,idSession))
    output: a flat list [idPart, idPart, ... ,idPart] in the original order;
        the same idPart can still appear more than once when it occurs in
        several sessions
    '''
    # Tuple parameters ("def f((x, y))") and list-returning map() are
    # Python 2 only; unpacking inside a comprehension behaves the same on
    # both Python 2 and Python 3 and still rejects items that are not pairs.
    return [idPart for idPart, _idSession in self.removeReduplicatePart(sequence)]
def GroupSession(self, sequence):
    '''
    List the distinct session ids of every user, in order of first appearance.

    input: a list created from 'idPart+idSession', i.e. one sequence of
        (idPart, idSession) pairs per user
    output: a list with one tuple per user,
        [(idSession,idSession,...,idSession),...,(idSession,idSession,...,idSession)]
        where every idSession occurs only once inside its tuple, such as
        [(1,2,3,4,5,6,7),(1,2,3,4,5),..,(1,2,3,4,5,6,7,8,9,10,11,12)]
    '''
    grouped = []
    for userPairs in sequence:
        # keep only the idSession component of each pair
        sessions = tuple(pair[1] for pair in userPairs)
        grouped.append(tuple(self.removeReduplicatePart(sessions)))
    return grouped
'''
*******************************************************************************************
Generates specific sequences or documents
*******************************************************************************************
'''
def filterSequence(self, sequence, argument, kind, needToSave):
'''
......@@ -104,31 +190,125 @@ class DataProcessing:
pickle.dump(finalSequence, open(os.path.join('filtersequences', fileName), "wb"))
return finalSequence
def prepareInputDataPForCourse(self, sequence, kind):
    '''
    Build the idPart sequence of every user in a specific course.

    input:
        sequence -- a sequence from the Premium Sequence function
        kind -- passed through unchanged to filterSequence
    output: a list with one tuple per user,
        [(idPart,idPart,...idPart),(idPart,idPart,...idPart),...,(idPart,idPart,...idPart)]
        Repeated (idPart, idSession) pairs are removed before idSession is
        dropped, so an idPart is repeated only when it occurs in several
        sessions.
    '''
    Part_Session = self.filterSequence(sequence, 'idPart+idSession', kind, False)
    # RemoveSession already removes repeated (idPart, idSession) pairs via
    # removeReduplicatePart, so no separate deduplication pass is needed here.
    return [tuple(self.RemoveSession(userPairs)) for userPairs in Part_Session]
def prepareInputDataPSForCourse(self, sequence, kind):
    '''
    Build the (idPart, idSession) sequence of every user in a specific course.

    input:
        sequence -- a sequence from the Premium Sequence function
        kind -- passed through unchanged to filterSequence
    output: a list with one tuple per user,
        [((idPart,idSession),(idPart,idSession),...(idPart,idSession)),
         ((idPart,idSession),(idPart,idSession),...(idPart,idSession)),...]
        Repeated (idPart, idSession) pairs are removed; the order of first
        appearance is kept.
    '''
    Part_Session = self.filterSequence(sequence, 'idPart+idSession', kind, False)
    return [tuple(self.removeReduplicatePart(userPairs)) for userPairs in Part_Session]
def InputDataPGroupBySession(self, sequence, kind):
    '''
    Group the idParts of every user by session.

    input:
        sequence -- a sequence from the Premium Sequence function
        kind -- passed through unchanged to filterSequence and
            prepareInputDataPSForCourse
    output: a list with one tuple per user; each user tuple holds one tuple
        of idParts per session, sessions being ordered as returned by
        GroupSession:
        [((idPart,idPart,...idPart),(idPart,idPart,...idPart),...), ...]
    '''
    rawPairs = self.filterSequence(sequence, 'idPart+idSession', kind, False)
    idSession = self.GroupSession(rawPairs)
    firstSequence = self.prepareInputDataPSForCourse(sequence, kind)
    thirdSequence = []
    for i in range(len(firstSequence)):
        # Bucket the user's idParts by session in a single pass instead of
        # rescanning every pair once per session.
        partsBySession = {}
        for pair in firstSequence[i]:
            partsBySession.setdefault(pair[1], []).append(pair[0])
        thirdSequence.append(
            tuple(tuple(partsBySession.get(session, ())) for session in idSession[i])
        )
    return thirdSequence
def InputDataPSGroupBySession(self, sequence, kind):
    '''
    Group the idParts of every user by session and tag each group.

    input:
        sequence -- a sequence from the Premium Sequence function
        kind -- passed through unchanged to filterSequence and
            prepareInputDataPSForCourse
    output: a list with one tuple per user; each user tuple holds one tuple
        per session, made of the session's idParts followed by the 1-based
        position of that session in the user's session list (not the
        idSession value itself):
        [((idPart,idPart,...idPart,n),(idPart,idPart,...idPart,n),...), ...]
    '''
    rawPairs = self.filterSequence(sequence, 'idPart+idSession', kind, False)
    idSession = self.GroupSession(rawPairs)
    firstSequence = self.prepareInputDataPSForCourse(sequence, kind)
    thirdSequence = []
    for i in range(len(firstSequence)):
        # Bucket the user's idParts by session in a single pass instead of
        # rescanning every pair once per session.
        partsBySession = {}
        for pair in firstSequence[i]:
            partsBySession.setdefault(pair[1], []).append(pair[0])
        userGroups = []
        for n, session in enumerate(idSession[i]):
            # the session ordinal goes last, after the idParts
            userGroups.append(tuple(partsBySession.get(session, [])) + (n + 1,))
        thirdSequence.append(tuple(userGroups))
    return thirdSequence
def exportSequenceToTXTFile(self, sequences, fileName):
    '''
    Export sequences into a TXT file.

    Every sequence becomes one line: each element is written followed by
    ' -1 ' and the line is closed with ' -2'. Elements are rendered with
    str(); the surrounding parentheses and the trailing comma of a
    one-element tuple are stripped, so (3,) is written as "3" and (4, 5)
    as "4, 5".

    input:
        sequences -- an iterable of sequences of PartId values (or tuples
            of them)
        fileName -- name of the file written inside the 'TXT' directory,
            which is resolved against the current working directory and
            must already exist
    output: None
    '''
    separator = ' -1 '
    # "with" closes the file even when a write fails part-way through.
    with open(os.path.join('TXT', fileName), 'w') as txtFile:
        for sequence in sequences:
            for idPart in sequence:
                # str.find() returns -1 (truthy) when the text is absent and
                # 0 (falsy) when it is at the start, so it cannot serve as a
                # presence test; replace() is simply a no-op when absent.
                text = str(idPart).replace(",)", ")")
                text = text.lstrip('(').strip(',').rstrip(')')
                txtFile.write(text + separator)
            txtFile.write(' -2\n')
'''
*******************************************************************************************
Gets stats for analyzing
*******************************************************************************************
'''
def getCorrectPSDsequence(self, sequence, kind, courseName, needToSave):
'''
......@@ -183,27 +363,42 @@ class DataProcessing:
print sum
ave = float(sum)/float(len(idUser))
return ave
def averageDurationStatsForPart(self, kind, sequence):
    '''
    Get a duration statistic for every part of a special course.

    input:
        kind -- 'median' for the median duration, 'stdev' for the
            population standard deviation; any other value gives the
            arithmetic mean
        sequence -- a sequence got from the Premium Sequence function
    output: one entry per idPart found in the first tuple returned by
        removeReduplication, every value rounded to 3 decimals:
            mean / 'median': [(idPart, value), ...]
            'stdev':         [[(idPart, value)], ...]  (each pair stays
                             wrapped in its own list, as callers receive
                             it today)
        A part without any recorded duration is skipped instead of raising
        ZeroDivisionError. An empty part list gives [].
    '''
    idPart = self.filterSequence(sequence, 'idPart', '_', False)
    Mon_idPart = self.removeReduplication(idPart)
    psd = self.getCorrectPSDsequence(sequence, '_', '_', False)
    if not Mon_idPart:
        return []

    # Collect the durations of every part in one pass over psd; each record
    # is read as (idPart, <unused here>, duration).
    durationsByPart = {}
    for userRecords in psd:
        for record in userRecords:
            durationsByPart.setdefault(record[0], []).append(record[2])

    stats = []
    for part in Mon_idPart[0]:
        values = durationsByPart.get(part)
        if not values:
            continue
        if kind == 'median':
            # a real median; this branch used to repeat the mean computation
            stats.append((part, float('%0.3f' % np.median(values))))
        elif kind == 'stdev':
            stats.append([(part, float('%0.3f' % sqrt(np.var(values))))])
        else:
            mean = float(sum(values)) / float(len(values))
            stats.append((part, float('%0.3f' % mean)))
    return stats
\ No newline at end of file
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment