
Saturday, July 23, 2011

Decision trees with scala-recog: a machine learning library in Scala

scala-recog is a project I created on Google Code: a library of machine learning algorithms written in Scala. One of its features is an implementation of the ID3 algorithm, which builds a decision tree from a training set of known (already labelled) elements.

Example: classifying curricula for a job site

On a job site you need to classify curricula in order to show them to logged-in users. You could extract a huge quantity of data from each curriculum, but let's extract just a few features:
  • whether the candidate has any certifications
  • whether (s)he is talkative
  • whether (s)he has joined a golf club
  • whether (s)he has earned a master's degree
and put them all in a class:
  /** One curriculum of the training set.
    *
    * The four boolean fields are the features used for classification;
    * `job` is the label attached to the curriculum.
    *
    * Case-class parameters are already public vals, so no `val` modifier is needed.
    */
  final case class Person(
    hasCertifications : Boolean,
    isTalkative : Boolean,
    golfClub : Boolean,
    hasMasterDegree : Boolean,
    job : String)

  // Training set: nine labelled curricula.
  // Built with List(...) so the expression is complete on its own; the original
  // cons chain ended in a dangling `::` with no terminating Nil.
  val persons : List[Person] = List(
    Person(hasCertifications = true, isTalkative = false,
           golfClub = false, hasMasterDegree = true,
           job = "Programmer"),
    Person(hasCertifications = false, isTalkative = false,
           golfClub = false, hasMasterDegree = true,
           job = "Junior Programmer"),
    Person(hasCertifications = true, isTalkative = false,
           golfClub = false, hasMasterDegree = false,
           job = "Programmer"),
    Person(hasCertifications = false, isTalkative = true,
           golfClub = false, hasMasterDegree = true,
           job = "Seller"),
    Person(hasCertifications = false, isTalkative = true,
           golfClub = false, hasMasterDegree = false,
           job = "Seller"),
    Person(hasCertifications = true, isTalkative = true,
           golfClub = false, hasMasterDegree = false,
           job = "Seller"),
    Person(hasCertifications = false, isTalkative = true,
           golfClub = true, hasMasterDegree = true,
           job = "CEO"),
    Person(hasCertifications = false, isTalkative = false,
           golfClub = true, hasMasterDegree = false,
           job = "CEO"),
    Person(hasCertifications = false, isTalkative = false,
           golfClub = true, hasMasterDegree = false,
           job = "CEO")
  )
The list persons holds my training set. To use the ID3 algorithm, just import the decision-tree package and train on the list:
import org.scalarecog.decisiontree._

// Flattens a Person into its four boolean features, always in the same order.
def toVector(p : Person) : Vector[Boolean] =
  Vector(p.hasCertifications, p.isTalkative, p.golfClub, p.hasMasterDegree)

// Pairs every feature vector with its label (the job).
val dataset = persons.map { person => toVector(person) -> person.job }

// Builds the decision tree from the labelled examples.
val tree = (new ID3[Boolean,String]).buildTree(dataset)
toVector is needed because the ID3 class works on a Vector of feature values.
Now the tree can classify a person:

// A candidate with every feature set to false; "?" is a placeholder for the unknown job.
val newPerson = Person(
  hasCertifications = false,
  isTalkative = false,
  golfClub = false,
  hasMasterDegree = false,
  job = "?")
// Outcome reported by the article: the tree labels this candidate "Junior Programmer".
tree.classify(toVector(newPerson)) == "Junior Programmer"

It would also be nice to see the decision tree created by ID3. With JGraph that is straightforward, and I get this:
Hey, it's the real life! ^_^
Here is the full code:

package scalarecoggraph

import org.scalarecog.decisiontree._
import javax.swing.JFrame
import com.mxgraph.swing.mxGraphComponent
import com.mxgraph.view.mxGraph

/** Swing frame that renders a decision tree with JGraphX.
  *
  * NOTE(review): the published listing lost every line that consisted of a single
  * token (closing braces, `created`, and similar). The restored lines are marked
  * below; confirm them against the original project sources.
  *
  * @param tree          decision tree to display
  * @param propertyNames label for each feature, indexed by the branch node's `index`
  */
class Program(tree : DecisionTree[Vector[Boolean], String], propertyNames : Vector[String]) extends JFrame("ScalaRecog") {
  type Tree = DecisionTree[Vector[Boolean], String]
  /** A graph cell paired with the (x, y) position it was inserted at. */
  type Vertex = (AnyRef, (Double, Double))

  /** Inserts one vertex per tree node and one edge per branch into a new graph,
    * then adds the graph component to the frame's content pane.
    */
  def draw(): Unit = {
    val graph: mxGraph = new mxGraph
    val root = graph.getDefaultParent

    // Draws `t` at `parentPos + offset` and returns the vertex created for it.
    def drawSubtree(t : Tree, parentPos : (Double, Double), offset : (Int, Int)) : Vertex = {
      // Inserts a vertex, then runs `action` on it (used to attach the children).
      def createVertex(label : String, action : Vertex => Unit = _ => ()) : Vertex = {
        val vertexSize = (100, 30)
        val newPos = (parentPos._1 + offset._1, parentPos._2 + offset._2)
        // A null id asks JGraphX to generate one.
        val created = (graph.insertVertex(root, null, label, newPos._1, newPos._2 , vertexSize._1, vertexSize._2), newPos)
        // Restored: without these two lines `action` is never used and no Vertex is returned.
        action(created)
        created
      }

      def createEdge(label : String, from : Vertex, to : Vertex) = graph.insertEdge(root, null, label, from._1, to._1)

      t match {
        case a : DecisionLeaf[Vector[Boolean],String] => createVertex(a.label)
        case a : DecisionBranchVector[String,Boolean] =>
          createVertex(propertyNames(a.index), n => {
            // Each child is shifted 120 px further right than the previous one and
            // placed one vertical step (offset._2) below its parent.
            for (  ((label, child), index) <- a.branches.zipWithIndex  )
              createEdge(label.toString, n, drawSubtree(child, n._2, (120*index, offset._2)))
          })
      }
    }

    // Restored: the usual JGraphX update bracket, so endUpdate runs even if drawing fails.
    graph.getModel.beginUpdate()
    try {
      drawSubtree(tree, (0, 0), (120, 120))
    }
    finally {
      graph.getModel.endUpdate()
    }
    getContentPane.add(new mxGraphComponent(graph))
  }
}

/** Demo entry point: trains an ID3 tree on a small set of curricula, classifies
  * one new candidate, and shows the resulting tree in a window.
  *
  * NOTE(review): the published listing lost its single-token lines. The closing
  * delimiters, the list terminator and the calls that draw and show the frame are
  * restored here; confirm them against the original project sources.
  */
object Program  {

  /** One curriculum: four boolean features plus the job used as label. */
  final case class Person(
    hasCertifications : Boolean,
    isTalkative : Boolean,
    golfClub : Boolean,
    hasMasterDegree : Boolean,
    job : String)

  def main(args : Array[String]) : Unit = {

    // Training set. List(...) replaces the original cons chain, which ended in a dangling `::`.
    val persons : List[Person] = List(
      Person(hasCertifications = true, isTalkative = false, golfClub = false, hasMasterDegree = true, job = "Programmer"),
      Person(hasCertifications = false, isTalkative = false, golfClub = false, hasMasterDegree = true, job = "Junior Programmer"),
      Person(hasCertifications = true, isTalkative = false, golfClub = false, hasMasterDegree = false, job = "Programmer"),
      Person(hasCertifications = false, isTalkative = true, golfClub = false, hasMasterDegree = true, job = "Seller"),
      Person(hasCertifications = false, isTalkative = true, golfClub = false, hasMasterDegree = false, job = "Seller"),
      Person(hasCertifications = true, isTalkative = true, golfClub = false, hasMasterDegree = false, job = "Seller"),
      Person(hasCertifications = false, isTalkative = true, golfClub = true, hasMasterDegree = true, job = "CEO"),
      Person(hasCertifications = false, isTalkative = false, golfClub = true, hasMasterDegree = false, job = "CEO"),
      Person(hasCertifications = false, isTalkative = false, golfClub = true, hasMasterDegree = false, job = "CEO")
    )

    // Flattens a Person into its four boolean features, always in the same order.
    def toVector(p : Person) : Vector[Boolean] =
      Vector(p.hasCertifications, p.isTalkative, p.golfClub, p.hasMasterDegree)
    val dataset = persons map (p => (toVector(p), p.job))

    val tree = new ID3[Boolean,String] buildTree dataset

    // The original compared the result with "Junior Programmer" and discarded the
    // boolean; print the prediction so the classification is actually visible.
    val newPerson = Person(hasCertifications = false, isTalkative = false, golfClub = false, hasMasterDegree = false, job = "?")
    val predictedJob = tree.classify(toVector(newPerson))
    println(s"Predicted job for the new person: $predictedJob")

    val frame = new Program(tree, Vector("Has certifications?", "Is talkative?", "Likes playing golf?", "Has a master degree?"))
    // Populate the graph before showing the window.
    frame.draw()
    frame.setSize(400, 320)
    frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE)
    frame.setVisible(true)
  }
}
