Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Antoine PIGEAU
2015-Hubble-UserProfiles
Commits
0bc1ff06
Commit
0bc1ff06
authored
Jul 05, 2016
by
Bo
Browse files
creates the exportSequenceToTXTFile for adapting the frequent mining
software
parent
cadef72a
Changes
1
Hide whitespace changes
Inline
Side-by-side
2015-Hubble-UserProfile/BukaiLinNew/dataProcessing.py
View file @
0bc1ff06
...
...
@@ -7,20 +7,106 @@ Created on 2016年6月3日
from
dataManager.scriptDatabaseManager2
import
ScriptDatabaseManager
from
dataManager.databaseManagerData2
import
DatabaseManager
from
dataManager.scriptDatabaseStatistic
import
ScriptDatabaseStatistic
from
unidecode
import
unidecode
import
math
import
csv
import
os
import
pickle
import
numpy
as
np
from
asyncore
import
read
from
lib2to3.pgen2.tokenize
import
Special
from
pattern.metrics
import
duration
from
symbol
import
argument
from
operator
import
itemgetter
from
math
import
sqrt
# Manager instances built once, as a side effect of importing this module.
# NOTE(review): neither name is referenced by the methods visible in this
# part of the file - confirm they are still needed.
databaseManager = DatabaseManager()
scriptDatabaseManager = ScriptDatabaseManager()
class
DataProcessing
:
'''
*******************************************************************************************
Processing function
*******************************************************************************************
'''
def removeReduplicatePart(self, sequence):
    '''
    Drop repeated items from a sequence while keeping first-seen order.

    input: an iterable of hashable items, typically
           ((idPart,idSession),(idPart,idSession),...,(idPart,idSession))
    output: a new list holding every distinct item exactly once, in the
            order of its first occurrence in the input
    '''
    # One pass with a "seen" set gives the same result as sorting the set
    # by sequence.index, without rescanning the input for every distinct
    # item and without requiring the input to provide an index() method.
    seen = set()
    unique = []
    for item in sequence:
        if item not in seen:
            seen.add(item)
            unique.append(item)
    return unique
def removeReduplication(self, sequence):
    '''
    Reduce every inner sequence to its distinct values in ascending order.

    input: a list of inner sequences, for example
           [(idPart, idPart, ... ,idPart),...,(idPart, idPart, ... ,idPart)]
           [(idSession, idSession, ... ,idSession),...,(idSession, ... ,idSession)]
           [(duration, duration, ... ,duration),(),...,(duration, ... ,duration)]
    output: a list with one tuple per inner sequence; each tuple holds the
            distinct values of that inner sequence, sorted
    '''
    return [tuple(sorted(set(group))) for group in sequence]
def RemoveSession(self, sequence):
    '''
    Strip the idSession from a sequence of (idPart, idSession) pairs.

    input: a list of pairs ((idPart,idSession),(idPart,idSession),...),
           as produced by removeReduplicatePart
    output: a list [idPart, idPart, ... ,idPart] with one entry per distinct
            pair, in first-seen order; this is the input shape expected by
            the Frequent Pattern Mining operation
    '''
    # Unpacking in the comprehension replaces the nested "def f((x, y))"
    # helper: tuple parameters were removed by PEP 3113, and a list
    # comprehension returns a real list where map() may return an iterator.
    return [idPart for idPart, _ in self.removeReduplicatePart(sequence)]
def GroupSession(self, sequence):
    '''
    List the distinct idSession values found in each inner sequence.

    input: a list built with 'idPart+idSession'; every inner sequence holds
           items whose second element is the idSession
    output: a list with one tuple per inner sequence; each tuple holds the
            idSession values without repetition, in first-seen order, such
            as [(1,2,3,4,5,6,7),(1,2,3,4,5),..,(1,2,3,4,5,6,7,8,9,10,11,12)]
    '''
    # Collect the raw idSession column of every inner sequence first ...
    sessionColumns = [
        tuple(pair[1] for pair in pairs)
        for pairs in sequence
    ]
    # ... then remove the repetitions inside each column.
    return [
        tuple(self.removeReduplicatePart(column))
        for column in sessionColumns
    ]
'''
*******************************************************************************************
Generates specific sequences or documents
*******************************************************************************************
'''
def
filterSequence
(
self
,
sequence
,
argument
,
kind
,
needToSave
):
'''
...
...
@@ -104,31 +190,125 @@ class DataProcessing:
pickle
.
dump
(
finalSequence
,
open
(
os
.
path
.
join
(
'filtersequences'
,
fileName
),
"wb"
))
return
finalSequence
def prepareInputDataPForCourse(self, sequence, kind):
    '''
    Build the idPart-only sequences of a course.

    input: a sequence from the Premium Sequence function, and the kind
           forwarded unchanged to filterSequence
    output: a list with one tuple of idPart per entry returned by
            filterSequence (presumably one entry per user of the course)
            [(idPart,idPart,...idPart),...,(idPart,idPart,...idPart)]
    '''
    Part_Session = self.filterSequence(sequence, 'idPart+idSession', kind, False)
    return [
        # drop repeated (idPart, idSession) pairs, then keep only the idPart
        tuple(self.RemoveSession(self.removeReduplicatePart(pairs)))
        for pairs in Part_Session
    ]
def prepareInputDataPSForCourse(self, sequence, kind):
    '''
    Build the (idPart, idSession) sequences of a course.

    input: a sequence from the Premium Sequence function, and the kind
           forwarded unchanged to filterSequence
    output: a list with one tuple of pairs per entry returned by
            filterSequence (presumably one entry per user of the course),
            repeated pairs removed and first-seen order kept
            [((idPart,idSession),(idPart,idSession),...),...]
    '''
    Part_Session = self.filterSequence(sequence, 'idPart+idSession', kind, False)
    return [
        tuple(self.removeReduplicatePart(pairs))
        for pairs in Part_Session
    ]
def InputDataPGroupBySession(self, sequence, kind):
    '''
    Split the idPart of every entry into one group per session.

    input: a sequence from the Premium Sequence function, and the kind
           forwarded unchanged to filterSequence
    output: a list with one tuple per entry of prepareInputDataPSForCourse;
            each of these holds one tuple of idPart per session of that
            entry, the sessions following the order given by GroupSession
            [((idPart,...),(idPart,...),...),...]
    '''
    rawPairs = self.filterSequence(sequence, 'idPart+idSession', kind, False)
    sessionsPerEntry = self.GroupSession(rawPairs)
    pairsPerEntry = self.prepareInputDataPSForCourse(sequence, kind)

    grouped = []
    for position, pairs in enumerate(pairsPerEntry):
        # one tuple of idPart for every session id of this entry
        perSession = [
            tuple(pair[0] for pair in pairs if pair[1] == sessionId)
            for sessionId in sessionsPerEntry[position]
        ]
        grouped.append(tuple(perSession))
    return grouped
def InputDataPSGroupBySession(self, sequence, kind):
    '''
    Split the idPart of every entry into one numbered group per session.

    input: a sequence from the Premium Sequence function, and the kind
           forwarded unchanged to filterSequence
    output: a list with one tuple per entry of prepareInputDataPSForCourse;
            each of these holds one tuple per session of that entry, made of
            the idPart of the session followed by the 1-based position of
            the session in the GroupSession result (not the idSession value)
            [((idPart,...,idPart,1),(idPart,...,idPart,2),...),...]
    '''
    rawPairs = self.filterSequence(sequence, 'idPart+idSession', kind, False)
    sessionsPerEntry = self.GroupSession(rawPairs)
    pairsPerEntry = self.prepareInputDataPSForCourse(sequence, kind)

    grouped = []
    for position, pairs in enumerate(pairsPerEntry):
        perSession = []
        for rank, sessionId in enumerate(sessionsPerEntry[position], 1):
            parts = [pair[0] for pair in pairs if pair[1] == sessionId]
            # the session rank closes the group
            parts.append(rank)
            perSession.append(tuple(parts))
        grouped.append(tuple(perSession))
    return grouped
def exportSequenceToTXTFile(self, sequences, fileName):
    '''
    Export sequences into a TXT file for the frequent mining software.

    input: sequences, an iterable of sequences of items (an idPart or a
           tuple of idPart), and fileName, the name of the file written
           inside the 'TXT' directory (the directory must already exist)
    output: nothing is returned; the file gets one line per sequence, every
            item followed by ' -1 ' and the line closed by ' -2'
    '''
    separator = ' -1 '
    # "with" closes the file even when a write fails part-way through.
    with open(os.path.join('TXT', fileName), 'w') as output:
        for sequence in sequences:
            for idPart in sequence:
                # str() of a tuple gives "(1, 2)" or "(1,)": the trailing
                # comma of a one-item tuple and the surrounding brackets
                # are removed. The replace is unconditional because the
                # former "if p.find(...)" test was true even when nothing
                # was found (find returns -1 in that case).
                text = str(idPart).replace(",)", ")")
                text = text.lstrip('(').strip(',').rstrip(')')
                output.write(text + separator)
            output.write(' -2\n')
'''
*******************************************************************************************
Gets stats for analyzing
*******************************************************************************************
'''
def
getCorrectPSDsequence
(
self
,
sequence
,
kind
,
courseName
,
needToSave
):
'''
...
...
@@ -183,27 +363,42 @@ class DataProcessing:
print
sum
ave
=
float
(
sum
)
/
float
(
len
(
idUser
))
return
ave
def
averageDurationForPart
(
self
,
sequence
):
def
averageDuration
Stats
ForPart
(
self
,
kind
,
sequence
):
'''
get the average duration of the special part in a special course
sequence is got from the Premium Sequence function
'''
idPart
=
self
.
filterSequence
(
sequence
,
'idPart'
,
'_'
,
False
)
Mon_idPart
=
self
.
removeReduplicat
e
(
idPart
)
Mon_idPart
=
self
.
removeReduplicat
ion
(
idPart
)
psd
=
self
.
getCorrectPSDsequence
(
sequence
,
'_'
,
'_'
,
False
)
if
kind
==
'median'
:
median
=
[]
for
i
in
range
(
len
(
Mon_idPart
[
0
])):
sum
=
0
;
ave
=
0
;
n
=
0
for
j
in
range
(
len
(
psd
)):
for
k
in
range
(
len
(
psd
[
j
])):
if
psd
[
j
][
k
][
0
]
==
Mon_idPart
[
0
][
i
]:
sum
=
sum
+
psd
[
j
][
k
][
2
]
n
=
n
+
1
ave
=
float
(
sum
)
/
float
(
n
)
average
=
float
(
'%0.3f'
%
ave
)
median
.
append
(((
Mon_idPart
[
0
][
i
]),(
average
)))
return
median
duration
=
[]
for
i
in
range
(
len
(
Mon_idPart
[
0
]))
:
sum
=
0
;
ave
=
0
;
n
=
0
for
j
in
range
(
len
(
psd
)):
for
k
in
range
(
len
(
psd
[
j
]
)):
if
psd
[
j
][
k
][
0
]
==
Mon_idPart
[
0
][
i
]
:
sum
=
sum
+
psd
[
j
][
k
][
2
]
n
=
n
+
1
ave
=
float
(
sum
)
/
float
(
n
)
average
=
float
(
'%0.3f'
%
ave
)
duration
.
append
(((
Mon_idPart
[
0
][
i
]),(
average
)))
stdev
=
[]
if
kind
==
'stdev'
:
for
i
in
range
(
len
(
Mon_idPart
[
0
])):
a
=
[];
b
=
[];
for
j
in
range
(
len
(
psd
)):
for
k
in
range
(
len
(
psd
[
j
]))
:
if
psd
[
j
][
k
][
0
]
==
Mon_idPart
[
0
][
i
]:
a
.
append
(
psd
[
j
][
k
][
2
])
b
.
append
(((
Mon_idPart
[
0
][
i
]),
float
(
'%0.3f'
%
sqrt
(
np
.
var
(
a
))))
)
stdev
.
append
(
b
)
return
stdev
return
duration
\ No newline at end of file
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment