Tuesday, 5 June 2012

Spearman's Rank Correlation Coefficient in F#

I have recently started to learn F#. To test what I have learnt so far, I decided to write the same program as in the previous post. I've commented the program well, so there is no real need for additional explanation.
/// Computes Spearman's rank correlation coefficient for two equally sized
/// integer samples.
///
/// Each sample is ranked independently: the largest value gets rank 1, the
/// smallest gets rank n, and tied values share the mean of the ranks they
/// occupy. The coefficient is then 1 - 6 * sum(d^2) / (n^3 - n), where d is
/// the difference between the paired ranks.
///
/// Parameters:
///   setX, setY: the paired observations; both arrays must have the same length.
/// Returns:
///   the coefficient as a float. With fewer than two observations the
///   denominator is zero and the result is NaN.
/// Raises:
///   System.ArgumentException when the two arrays differ in length.
let calcCoeff (setX : int[]) (setY : int[]) =
    // Returns the rank of every element of `set`, in the original element order.
    let getRanks (set : int[]) =

        // Walks the ascending-sorted values and builds one (value, rank) pair
        // per distinct value.
        //   pairs:   the (value, rank) pairs collected so far
        //   freq:    how many times the current value has been seen in a row
        //   curRank: the rank of the current position, counting down from n to 1
        //   acc:     the sum of the ranks of the earlier occurrences of the
        //            current value; kept as int64 because the rank sum of a
        //            long run of tied values does not fit in a 32-bit int
        //   (list):  the sorted values, passed implicitly for pattern matching
        let rec collectRanks pairs freq curRank (acc : int64) = function
            | [] -> pairs
            // The next value is identical: keep accumulating the tie.
            | hd :: (next :: _ as tl) when next = hd ->
                collectRanks pairs (freq + 1) (curRank - 1) (acc + int64 curRank) tl
            // Last (or only) occurrence of this value: its rank is the mean of
            // the ranks the run occupies, which is just curRank when freq is 1.
            | hd :: tl ->
                let sharedRank = float (acc + int64 curRank) / float freq
                collectRanks ((hd, sharedRank) :: pairs) 1 (curRank - 1) 0L tl

        // Build a value -> rank lookup once, instead of searching the ranked
        // array linearly for every element.
        let rankOf =
            set
            |> Array.sort
            |> Array.toList
            |> collectRanks [] 1 set.Length 0L
            |> Map.ofList

        set |> Array.map (fun value -> rankOf.[value])

    if setX.Length <> setY.Length then
        raise (new System.ArgumentException("Both collections of data must contain an equal number of elements"))

    let n = float setX.Length
    let ranksX = getRanks setX
    let ranksY = getRanks setY

    // Sum of the squared differences between the paired ranks
    let sigmaDiff =
        Array.zip ranksX ranksY
        |> Array.sumBy (fun (rankX, rankY) -> (rankX - rankY) ** 2.0)

    1.0 - (6.0 * sigmaDiff) / (n ** 3.0 - n)

/// Runs the worked example: computes the coefficient for two fixed
/// seven-element samples and prints it to standard output.
let main() =
    // Paired observations; both arrays hold the same number of elements.
    let xs = [| 72; 112; 46; 97; 46; 46; 52 |]
    let ys = [| 20; 2; 7; 11; 4; 12; 7 |]

    calcCoeff xs ys
    |> printfn "Spearman's Rank Correlation Coefficient: %f"

main()
Spearman's Rank Correlation Coefficient: -0.062500

On inspecting the performance of both the C# and F# versions (using System.Diagnostics.Stopwatch), the F# version came in just a few milliseconds faster than the C# version. Note also that I used type annotations on the two array parameters to constrain the arrays to type int.