Wednesday, May 20, 2015

Mnemosyne data analysis

I did this a while ago.

Steps:

  1. Download 2014-01-27-mnemosynelogs-all.db.xz
  2. Extract: xz -d 2014-01-27-mnemosynelogs-all.db.xz
  3. Count the records: sqlite3 -batch 2014-01-27-mnemosynelogs-all.db "select event, count(*) from log group by event;"

    Event  Type               Count
    1      start            813 548
    2      stop             786 586
    3      scheduler        172 462
    4      load db        3 123 878
    5      save db        1 109 352
    6      add card      58 532 022
    8      delete card    7 684 724
    9      repetition    48 965 836
    Total               121 188 408
  4. Dump repetitions to CSV: sqlite3 -csv -batch 2014-01-27-mnemosynelogs-all.db "select user_id,object_id,acq_reps,ret_reps,actual_interval,grade from log where event=9 limit 8;" (this only dumps the first few rows, of course)

    c136315a,9779a1ad,1,0,0,5
    c136315a,a2a80b21,1,0,0,4
    c136315a,a35a6c4a,1,0,0,5
    c136315a,85f8ec88,1,0,0,5
    c136315a,10ae2adc,1,0,0,5
    c136315a,a9c66681,1,0,0,4
    c136315a,ba841422,1,0,0,5
    c136315a,4e108d46,1,0,0,5
    
  5. Sort on time to completion and grade (a sketch of the command follows the sample):
    db9864c5,37132b6b,6,10,-188957807,3
    db9864c5,680c6178,3,7,-188879971,3
    db9864c5,40712c82,7,6,-188879963,3
    db9864c5,a5679e62,17,14,-188879941,3
    db9864c5,21a9326f,14,11,-188652473,3
    dQjTu8hqz3d04ReDrWTxdZ,7humHviBhzGDsbmfZzkFNh,2,0,-157680000,3
    dQjTu8hqz3d04ReDrWTxdZ,LGuN7QIDz7jsjuwKeSazrB,2,0,-157680000,3
    dQjTu8hqz3d04ReDrWTxdZ,a2cgcETy4U1mpR8gId581B,2,0,-157680000,2
    dQjTu8hqz3d04ReDrWTxdZ,2dHnVSa6v7HJXd1Bk6oOI7,2,0,-157680000,0
    dQjTu8hqz3d04ReDrWTxdZ,evdCglXJdRSwNAJwOaxDJo,2,0,-157680000,0
    dQjTu8hqz3d04ReDrWTxdZ,iWboGsLywTzw2agIQPUsBw,3,1,-157680000,0
    dQjTu8hqz3d04ReDrWTxdZ,y4hyxaB2HdW6UVQn15vdoh,23,2,-157680000,0
    qFHzllJkeSgoONQZJxkJ3c,LWYCS7wgH2T3aHPRUVUnH4,3,0,-127916114,2
    qFHzllJkeSgoONQZJxkJ3c,CIbwahRHaTaTCujq1X10QB,3,0,-127916114,0
    qFHzllJkeSgoONQZJxkJ3c,QPuSfCkJzbzk7uw2AJEA73,5,0,-127916103,1
    qFHzllJkeSgoONQZJxkJ3c,zdpemIGjakRrDMmK7Lvc6d,2,0,-127916095,1
    qFHzllJkeSgoONQZJxkJ3c,ILfYUuLjwER4VnpIYtUpWj,6,0,-127916088,0
    qFHzllJkeSgoONQZJxkJ3c,GdkgWUvUZMkhbp4QZim6fA,13,0,-127916071,1
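
    A sketch of the kind of command that produces this dump; step 6's Haskell reads exactly these six columns from grades.csv:

    sqlite3 -csv -batch 2014-01-27-mnemosynelogs-all.db \
      "select user_id,object_id,acq_reps,ret_reps,actual_interval,grade from log where event=9 order by actual_interval,grade;" \
      > grades.csv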
    
  6. Run this Haskell:
    {-# LANGUAGE ScopedTypeVariables #-}
    
    module GradeMunge where
    
    import qualified Data.ByteString.Lazy as BL
    import Data.Csv.Streaming
    import Data.Csv(encode)
    import System.IO
    import qualified Data.Map.Strict as M
    
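    -- one bin: (record count, grade -> count histogram, (lowest interval seen, highest interval seen))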
    type Data = (Integer,M.Map Integer Integer,(Integer,Integer))
    
    def :: Integer -> Data
    def r = (0,M.fromList [(0,0),(1,0),(2,0),(3,0),(4,0),(5,0)],(r,r))
    
    main :: IO ()
    main = do
        csvData <- BL.readFile "grades.csv"
        let out = munge (decode NoHeader csvData) (def (-188957808))
        BL.writeFile "gradehisto.csv" (encode out)
    
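    -- Walk the CSV stream, folding repetitions into bins of roughly 10k-100k records
    -- that share an interval range, and emit one histogram row per completed bin.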
    munge :: Records (BL.ByteString, BL.ByteString, Integer, Integer, Integer, Integer) -> Data -> [[Integer]]
    munge (Cons (Right (user,obj,acq_reps,ret_reps,interval,grade)) k) d@(total,_,(low,high)) =
        if (total < 10000 || (low <= interval && interval <= high && total < 100000)) then
            -- add the record
            munge k (insertRecord interval grade d)
        else
            -- return the record and start a new one
            prepOut d : munge k (insertRecord interval grade (def interval))
    munge (Cons (Left err) k) a = error ("blah: " ++ err) -- munge k a
    munge (Nil (Just s) k') a = error ("nil: " ++ s) -- [a]
    munge (Nil Nothing k') a = [prepOut a]
    
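    -- One output row: low interval, high interval, record count, then counts for grades 0-5.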
    prepOut :: Data -> [Integer]
    prepOut d@(total,xs,(low,high)) | length (M.keys xs) == 6 = low : high : total : M.elems xs
                                    | otherwise = error (show d ++ " keys: " ++ show (M.keys xs))
    
    insertRecord :: Integer -> Integer -> Data -> Data
    insertRecord interval grade (total,xs,(low,high)) = (total+1,M.insertWith (+) grade 1 xs,(low `min` interval,high `max` interval))
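
    To compile and run it (assuming GHC with the cassava, bytestring and containers packages installed; the module isn't Main, so point GHC at it):

    ghc -O2 -main-is GradeMunge GradeMunge.hs -o gradehisto
    ./gradehisto    # reads grades.csv, writes gradehisto.csv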
    
  7. Plot the resulting gradehisto.csv in Mathematica.
And my notes on the database format:
filename: (user_id)_[(machine_id)_](log_number).txt
CREATE TABLE parsed_logs(log_name text primary key); # used for incremental processing
CREATE TABLE _cards(id text primary key,last_rep int,offset int); # used for version < 2 munging; 'last_rep' is the timestamp of the card's last grading time; offset is 0, 1 (grade>=2 phase 1), or -1 (grade <= 2 phase 2)
insert or replace into _cards(id=card_id + user_id, offset, last_rep=)

CREATE TABLE log(
        user_id text,
        event integer,
        timestamp integer,
        object_id text,
        grade integer,
        easiness real,
        acq_reps integer,
        ret_reps integer,
        lapses integer,
        acq_reps_since_lapse integer,
        ret_reps_since_lapse integer,
        scheduled_interval integer,
        actual_interval integer,
        thinking_time integer,
        next_rep integer
    );
# Program Started (program_name_version = Mnemosyne 1.0-RC nt win32)
insert into log(user_id, event=STARTED_PROGRAM=1, timestamp, object_id=program_name_version)
# Program stopped
insert into log(user_id, event=STOPPED_PROGRAM=2, timestamp)
# Scheduler SM2 Mnemosyne
insert into log(user_id, event=STARTED_SCHEDULER=3, timestamp, object_id=scheduler_name)
# Loaded database N N N
insert into log(user_id, event=LOADED_DATABASE=4, timestamp, object_id=machine_id, acq_reps=scheduled_count, ret_reps=non_memorised_count, lapses=active_count)
# Saved database N N N
insert into log(user_id, event=SAVED_DATABASE=5, timestamp, object_id=machine_id, acq_reps=scheduled_count, ret_reps=non_memorised_count, lapses=active_count)
# New item id grade new_interval (munged, possibly add repetition too)
# Imported item id grade ret_reps last_rep next_rep interval (not munged)
insert into log(user_id, event=ADDED_CARD=6, timestamp, object_id=card_id)
# Deleted item id
insert into log(user_id, event=DELETED_CARD=8, timestamp, object_id=card_id)
# R id grade easiness | acq_reps ret_reps lapses acq_reps_since_lapse ret_reps_since_lapse | scheduled_interval actual_interval | new_interval noise | thinking_time
# R id grade 2.5 | 1 0 0 1 0 | 0 0 | new_interval 0 | 0 when adding new card and grade >= 2
insert into log(user_id, event=REPETITION=9, timestamp, object_id=card_id, grade, easiness, acq_reps, ret_reps, lapses, acq_reps_since_lapse, ret_reps_since_lapse, scheduled_interval, actual_interval=timestamp-previous_rep_timestamp, thinking_time, next_rep=timestamp + new_interval)

The initial grading in 'Add Card' is counted as an acquisition repetition (and explicitly logged as an 'R' event) when the grade is 2+; otherwise the grade is -1, like imported cards.

card_id is created as a hash of the card data
grade: 0-5; default is 0; -1 means "unseen"
easiness is the easiness parameter from the SM2 algorithm
acq_reps (acquisition reps) = number of repetitions with grade < 2, including the card
ret_reps (retention reps) = number of repetitions with grade >= 2, including the card
lapses = the number of times you forgot this card (new grade < 2, old grade >= 2)
acq_reps_since_lapse, ret_reps_since_lapse = the same counts since the last lapse
timestamp = last_rep = the last actual repetition
next_rep = timestamp of the next scheduled repetition
scheduled_interval: scheduled previous interval in seconds
actual_interval: actual previous interval in seconds
thinking_time: thinking time in seconds
                                  initial grade 0,1 (failing)   initial grade 2,3,4,5 (passing)
version < 0.9.8 (phase 1)         acq_reps=0                    acq_reps=0, R added, acq++
0.9.8 <= version < 2.0 (phase 2)  acq_reps=1, but all acq--     acq_reps=1->0, R added
2.0 <= version                    acq_reps=0                    acq_reps=0, R
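
As a quick sanity check on these notes, the per-grade distribution of repetitions can be pulled straight from the log table, for example:

sqlite3 -csv -header -batch 2014-01-27-mnemosynelogs-all.db \
  "select grade, count(*) from log where event=9 group by grade;"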

2 comments:

  1. By any chance, do you still have access to the dataset, or even better a link to the latest one?

    1. I looked, and I have the "gradehisto.csv" file used to generate the graph, but I don't have the original data. I think I deleted it, as laptop HDs were small back then and I was running out of disk space.
