Steps:
- Download 2014-01-27-mnemosynelogs-all.db.xz
- Extract:
tar -xvf 2014-01-27-mnemosynelogs-all.db.xz
- Count the records:
sqlite3 -batch 2014-01-27-mnemosynelogs-all.db "select event, count(*) from log group by event;"
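Total: 121 188 408

| Event | Count      | Type        |
|-------|------------|-------------|
| 1     | 813 548    | start       |
| 2     | 786 586    | stop        |
| 3     | 172 462    | scheduler   |
| 4     | 3 123 878  | load db     |
| 5     | 1 109 352  | save db     |
| 6     | 58 532 022 | add card    |
| 8     | 7 684 724  | delete card |
| 9     | 48 965 836 | repetition  |

- Dump repetitions to CSV: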
sqlite3 -csv -batch 2014-01-27-mnemosynelogs-all.db "select user_id,object_id,acq_reps,ret_reps,actual_interval,grade from log where event=9 limit 8;"
(this only does the first few lines of course)
c136315a,9779a1ad,1,0,0,5
c136315a,a2a80b21,1,0,0,4
c136315a,a35a6c4a,1,0,0,5
c136315a,85f8ec88,1,0,0,5
c136315a,10ae2adc,1,0,0,5
c136315a,a9c66681,1,0,0,4
c136315a,ba841422,1,0,0,5
c136315a,4e108d46,1,0,0,5
- Sort on time to completion and grade:
db9864c5,37132b6b,6,10,-188957807,3
db9864c5,680c6178,3,7,-188879971,3
db9864c5,40712c82,7,6,-188879963,3
db9864c5,a5679e62,17,14,-188879941,3
db9864c5,21a9326f,14,11,-188652473,3
dQjTu8hqz3d04ReDrWTxdZ,7humHviBhzGDsbmfZzkFNh,2,0,-157680000,3
dQjTu8hqz3d04ReDrWTxdZ,LGuN7QIDz7jsjuwKeSazrB,2,0,-157680000,3
dQjTu8hqz3d04ReDrWTxdZ,a2cgcETy4U1mpR8gId581B,2,0,-157680000,2
dQjTu8hqz3d04ReDrWTxdZ,2dHnVSa6v7HJXd1Bk6oOI7,2,0,-157680000,0
dQjTu8hqz3d04ReDrWTxdZ,evdCglXJdRSwNAJwOaxDJo,2,0,-157680000,0
dQjTu8hqz3d04ReDrWTxdZ,iWboGsLywTzw2agIQPUsBw,3,1,-157680000,0
dQjTu8hqz3d04ReDrWTxdZ,y4hyxaB2HdW6UVQn15vdoh,23,2,-157680000,0
qFHzllJkeSgoONQZJxkJ3c,LWYCS7wgH2T3aHPRUVUnH4,3,0,-127916114,2
qFHzllJkeSgoONQZJxkJ3c,CIbwahRHaTaTCujq1X10QB,3,0,-127916114,0
qFHzllJkeSgoONQZJxkJ3c,QPuSfCkJzbzk7uw2AJEA73,5,0,-127916103,1
qFHzllJkeSgoONQZJxkJ3c,zdpemIGjakRrDMmK7Lvc6d,2,0,-127916095,1
qFHzllJkeSgoONQZJxkJ3c,ILfYUuLjwER4VnpIYtUpWj,6,0,-127916088,0
qFHzllJkeSgoONQZJxkJ3c,GdkgWUvUZMkhbp4QZim6fA,13,0,-127916071,1
- Run this Haskell:
{-# LANGUAGE ScopedTypeVariables #-}
-- | Folds a stream of repetition records read from @grades.csv@ into
-- per-bucket grade histograms and writes them to @gradehisto.csv@.
module GradeMunge where

import qualified Data.ByteString.Lazy as BL
import Data.Csv.Streaming
import Data.Csv (encode)
import System.IO
import qualified Data.Map.Strict as M

-- | Accumulator for one bucket:
-- (number of records, tally per grade, (lowest interval, highest interval)).
type Data = (Integer, M.Map Integer Integer, (Integer, Integer))

-- | An empty bucket whose interval range is collapsed to @r@ and whose
-- tallies for grades 0 through 5 are all zero.
def :: Integer -> Data
def r = (0, M.fromList [(g, 0) | g <- [0 .. 5]], (r, r))

-- | Read @grades.csv@ (no header row), bucket it, and write the resulting
-- rows to @gradehisto.csv@.
main :: IO ()
main = do
  csvData <- BL.readFile "grades.csv"
  let records = decode NoHeader csvData
      histogram = munge records (def (-188957808))
  BL.writeFile "gradehisto.csv" (encode histogram)

-- | Walk the decoded records, accumulating them into the current bucket and
-- emitting one output row each time a bucket is closed.
--
-- A record joins the current bucket while the bucket holds fewer than 10000
-- records, or while the record's interval lies inside the bucket's current
-- range and the bucket holds fewer than 100000 records. Otherwise the bucket
-- is emitted and a fresh one is started at the record's interval.
--
-- A row that fails to decode, or a stream that ends with an error message,
-- aborts the run via 'error'.
munge :: Records (BL.ByteString, BL.ByteString, Integer, Integer, Integer, Integer) -> Data -> [[Integer]]
munge (Cons (Right (_user, _obj, _acqReps, _retReps, interval, grade)) rest) bucket@(count, _, (lo, hi))
  | keepFilling = munge rest (insertRecord interval grade bucket)
  | otherwise   = prepOut bucket : munge rest (insertRecord interval grade (def interval))
  where
    insideRange = lo <= interval && interval <= hi
    keepFilling = count < 10000 || (insideRange && count < 100000)
munge (Cons (Left err) _) _ = error ("blah: " ++ err)
munge (Nil (Just msg) _) _ = error ("nil: " ++ msg)
munge (Nil Nothing _) bucket = [prepOut bucket]

-- | Flatten a bucket into an output row: low, high, total, then the tallies
-- in ascending grade order. Aborts if the tally map does not hold exactly
-- six keys (i.e. a grade other than the six seeded by 'def' was inserted).
prepOut :: Data -> [Integer]
prepOut bucket@(count, tallies, (lo, hi))
  | M.size tallies == 6 = lo : hi : count : M.elems tallies
  | otherwise = error (show bucket ++ " keys: " ++ show (M.keys tallies))

-- | Add one record to a bucket: bump the record count and the tally for
-- @grade@, and widen the interval range to include @interval@.
insertRecord :: Integer -> Integer -> Data -> Data
insertRecord interval grade (count, tallies, (lo, hi)) =
  (count + 1, tallies', (min lo interval, max hi interval))
  where
    tallies' = M.insertWith (+) grade 1 tallies
- Plot in Mathematica:
filename: (user_id)_[(machine_id)_](log_number).txt

CREATE TABLE parsed_logs(log_name text primary key);
# used for incremental processing

CREATE TABLE _cards(id text primary key,last_rep int,offset int);
# used for version < 2 munging; 'last_rep' is the timestamp of the card's last grading time; offset is 0, 1 (grade>=2 phase 1), or -1 (grade <= 2 phase 2)
insert or replace into _cards(id=card_id + user_id, offset, last_rep=)

CREATE TABLE log(
    user_id text,
    event integer,
    timestamp integer,
    object_id text,
    grade integer,
    easiness real,
    acq_reps integer,
    ret_reps integer,
    lapses integer,
    acq_reps_since_lapse integer,
    ret_reps_since_lapse integer,
    scheduled_interval integer,
    actual_interval integer,
    thinking_time integer,
    next_rep integer
);

# Program Started (program_name_version = Mnemosyne 1.0-RC nt win32)
insert into log(user_id, event=STARTED_PROGRAM=1, timestamp, object_id=program_name_version)

# Program stopped
insert into log(user_id, event=STOPPED_PROGRAM=2, timestamp)

# Scheduler SM2 Mnemosyne
insert into log(user_id, event=STARTED_SCHEDULER=3, timestamp, object_id=scheduler_name)

# Loaded database N N N
insert into log(user_id, event=LOADED_DATABASE=4, timestamp, object_id=machine_id, acq_reps=scheduled_count, ret_reps=non_memorised_count, lapses=active_count)

# Saved database N N N
insert into log(user_id, event=SAVED_DATABASE=5, timestamp, object_id=machine_id, acq_reps=scheduled_count, ret_reps=non_memorised_count, lapses=active_count)

# New item id grade new_interval (munged, possibly add repetition too)
# Imported item id grade ret_reps last_rep next_rep interval (not munged)
insert into log(user_id, event=ADDED_CARD=6, timestamp, object_id=card_id)

# Deleted item id
insert into log(user_id, event=DELETED_CARD=8, timestamp, object_id=card_id)

# R id grade easiness | acq_reps ret_reps lapses acq_reps_since_lapse ret_reps_since_lapse | scheduled_interval actual_interval | new_interval noise | thinking_time
# R id grade 2.5 | 1 0 0 1 0 | 0 0 | new_interval 0 | 0 when adding new card and grade >= 2
insert into log(user_id, event=REPETITION=9, timestamp, object_id=card_id, grade, easiness, acq_reps, ret_reps, lapses, acq_reps_since_lapse, ret_reps_since_lapse, scheduled_interval, actual_interval=timestamp-previous_rep_timestamp, thinking_time, next_rep=timestamp + new_interval)

initial grading in 'Add Card' counted as an acquisition repetition (and explicitly logged as an 'R' event) when grade 2+, otherwise grade= -1 like imported cards

card_id is created as a hash of the card data
gr= the grade, 0-5, default is 0, -1 means "unseen"
easiness is the easiness parameter from the SM2 algorithm
acquisition reps = # w/ gr<2, including card
retention reps = # w/ gr=2+, including card
lapses = the number of times you forget this card (new grade < 2, old grade >=2)
ac_rp_l, rt_rp_l = ... since lapse
timestamp = last_rep = last actual repetition
next_rep = next scheduled repetition, timestamp
sch_i: scheduled previous interval in seconds
act_i: actual previous interval in seconds
th_t: thinking time in seconds

| Version                          | initial grade 0,1 (failing) | initial grade 2,3,4,5 (passing) |
|----------------------------------|-----------------------------|---------------------------------|
| Version < 0.9.8 (phase 1)        | acq_reps=0                  | acq_reps=0, R added, acq++      |
| 0.9.8 <= version < 2.0 (phase 2) | acq_reps=1, but all acq--   | acq_reps=1->0, R added          |
| 2.0 <= version                   | acq_reps=0                  | acq_reps=0, R                   |
By any chance, do you still have access to the dataset or, even better, a link to the latest one?
Reply
Delete
I looked and I have the "gradehisto.csv" file used to generate the graph, but I don't have the original data. I think I deleted it, as laptop HDs were small back then and I was running out of disk space.
Delete