// Code from Expedia Hotel Recommendations

using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading;
using Boltzman.Core.Utilities;
using Boltzman.Core.Utilities.Csv;
using Boltzman.Tests;
using NUnit.Framework;

namespace Boltzman.ExpediaHotelRecommendations
{
    //Getting leaked info 23.434s
    //Total: 0.2288588(1976304), Specific: 0.8881389(509261)
    [TestFixture]
    public class ConstantDistanceFinder : TestFixtureBase
    {
        // Cap on rows read from the search files; full runs overwrite this with int.MaxValue.
        public static int MaxRows = 600 * 1000;
        //public static int MaxRows = int.MaxValue;
        // A destination whose average distance is below this counts as "close" to a city.
        public float LimitForClose = 230;
        // A destination whose average distance is above this counts as "far" and is indexed for matching.
        public float LimitForFar = 30;
        // Confirmations required before a candidate constant delta is accepted as a hit.
        public int MinimumHits = 1;
        // Tolerance used while *searching* for a constant delta between two cities.
        public float Epsilon = 0.03f;
        // Tolerance used when *applying* a known delta to match a search against a distance.
        public float MatchEpsilon = 0.031f;

        // Historical scores from parameter experiments (Total/Specific MAP@5, hit counts):
        // Total: 0.01394861(600000), Specific: 0.6348937(13182), Count: 6159 => 0.51051
        // Total: 0.02265029(600000), Specific: 0.5906204(23010), Count: 14449 => 0.51772
        // Total: 0.03969542(600000), Specific: 0.4976858(47856), Count: 121121 => 0.52171
            
        // Total: 0.03984083(600000), Specific: 0.4974508(48054), Count: 124630
        // Number of constant-delta hits found by the last FindCitiesWithConstantDistanceDifferences run.
        public int HitsFound;
        // When true, the train-set leak is merged into the predicted labels.
        public static bool UseLeak = false;

        // Output of the delta-mining step; input of Verify/BuildSubmission/IdentifyHotels.
        public static string CitiesWithConstantDistancesFileName = "CitiesWithConstantDistances.txt";
        public static string SubmissionFileName = "WideLeak.csv";
        public static string HotelsAndDistancesFileName = "hotels_and_distances.csv";

        // Hit lines accumulated (under lock) by FindMatch, flushed at the end of the mining run.
        private readonly List<string> _hits = new List<string>();

        [Test]
        public void Double()
        {
            // End-to-end run on the capped data set (MaxRows): mine constant distance
            // deltas between city pairs, then score them on the validation split.
            FindCitiesWithConstantDistanceDifferences();
            Verify();
        }

        [Test]
        public void FullDoubleWithSubmissionAndMix()
        {
            // Full pipeline on the uncapped data with the leak enabled: mine deltas,
            // score on validation, build the test-set submission, then mix it with
            // the aggregator baseline. Sample output from a full run:
            /*
            HitsFound:7709628
            Verify
            MatchEpsilon: 0.01
            Loaded 11186881 train
            Loaded 1976304 validation
            Getting leaked info 23.645s
            Loading cities searches=24145292, cities=10283, cityDictionary=10283
            LoadConstantDistances 7709633 loaded
            Computing map...
            Total: 0.3378158(1976304), Specific: 0.7528934(886748), Count: 7709633
            */

            // NOTE: mutates static configuration shared by all tests in this fixture.
            UseLeak = true;
            MaxRows = int.MaxValue;
            FindCitiesWithConstantDistanceDifferences();
            Verify();
            BuildSubmission();
            MixIntoSubmission();
        }

        [Test]
        public void MixIntoSubmission()
        {
            new SubmissionMixer().Mix("WideLeak.csv", "aggregator 0.50566.csv", "WideLeakAggregator.csv");
        }

        [Test]
        public void BuildSubmission()
        {
            UseLeak = true;
            MaxRows = int.MaxValue;
            Console.WriteLine("BuildSubmission");
            List<_2_PreprocessSearches.Search> train = _2_PreprocessSearches.LoadBinTrainSearches().ToList();
            Console.WriteLine($"Loaded {train.Count} train");
            List<_2_PreprocessSearches.Search> test = _2_PreprocessSearches.LoadBinTestSearches().ToList();
            Console.WriteLine($"Loaded {test.Count} test");
            _2_PreprocessSearches.GetLeak(train, test);
            Dictionary<HashableObjectList, City> cityDictionary = LoadCities();
            LoadConstantDistances(cityDictionary);

            //List<List<byte>> selectedClusters = new List<List<byte>>();

            List<List<byte>> selectedClusters =
                test.ParallelSelect(search =>
                {
                    var labels = GetWideLeakLabels(cityDictionary, search);
                    _2_PreprocessSearches.AddLeakToLabelList(search, labels);
                    return labels.Select(x => (byte)x).ToList();
                });
            //foreach (_2_PreprocessSearches.Search search in test)
            //{
            //    var labels = GetWideLeakLabels(cityDictionary, search);
            //    _2_PreprocessSearches.AddLeakToLabelList(search, labels);
            //    selectedClusters.Add(labels.Select(x => (byte)x).ToList());
            //}

            _6_Aggregator2.SaveSubmission(selectedClusters, test, SubmissionFileName);
        }

        [Test]
        public void IdentifyHotels()
        {
            // For each city, join up hotels given the distances computed 
            Console.WriteLine("IdentifyHotels");
            Dictionary<HashableObjectList, City> cityDictionary = LoadCities();
            LoadConstantDistances(cityDictionary);

            Dictionary<Distance, Hotel> distanceHotelDictionary = new Dictionary<Distance, Hotel>();
            HashSet<Hotel> hotels = new HashSet<Hotel>();
            foreach (City firstCity in cityDictionary.Values)
            {
                foreach (ConstantDistanceCitySearchDestination constantDistanceCitySearchDestination in firstCity.ConstantDistanceCitySearchDestinations)
                {
                    City secondCity = constantDistanceCitySearchDestination.City;

                    // Now step along both arrays to figure out which cities line up
                    CitySearchDestination firstCityDestinations = firstCity.SearchDestinations.SafeGetValue(constantDistanceCitySearchDestination.srch_destination_id);
                    CitySearchDestination secondCityDestinations = secondCity.SearchDestinations.SafeGetValue(constantDistanceCitySearchDestination.srch_destination_id);

                    if (firstCityDestinations == null || secondCityDestinations == null)
                    {
                        continue;
                    }

                    int fi = 0;
                    int si = 0;
                    while (true)
                    {
                        if (fi >= firstCityDestinations.Distances.Count) { break; }
                        if (si >= secondCityDestinations.Distances.Count) { break; }

                        var first = firstCityDestinations.Distances[fi];
                        var second = secondCityDestinations.Distances[si];

                        double delta = (first.orig_destination_distance + constantDistanceCitySearchDestination.Delta - second.orig_destination_distance);
                        if (Math.Abs(delta) < MatchEpsilon && first.WinningCluster == second.WinningCluster)
                        {
                            var firstHotel = distanceHotelDictionary.SafeGetValue(first);
                            var secondHotel = distanceHotelDictionary.SafeGetValue(second);

                            if (firstHotel == null && secondHotel == null)
                            {
                                Hotel hotel = new Hotel
                                {
                                    srch_destination_id = constantDistanceCitySearchDestination.srch_destination_id,
                                    HotelCluster = first.WinningCluster
                                };
                                hotel.Distances.Add(first);
                                hotel.Distances.Add(second);
                                distanceHotelDictionary.Add(first, hotel);
                                distanceHotelDictionary.Add(second, hotel);
                                hotels.Add(hotel);
                            }
                            else if (firstHotel != null && secondHotel != null)
                            {
                                if (firstHotel != secondHotel)
                                {
                                    firstHotel.Distances.UnionWith(secondHotel.Distances);
                                    secondHotel.Distances.ForEach(x => distanceHotelDictionary[x] = firstHotel);
                                    hotels.Remove(secondHotel);
                                }
                            }
                            else
                            {
                                firstHotel?.Distances.Add(second);
                                secondHotel?.Distances.Add(first);
                            }

                            //Console.WriteLine($"{firstCity},{secondCity},{constantDistanceCitySearchDestination.srch_destination_id},{constantDistanceCitySearchDestination.Delta},{first.orig_destination_distance}, {second.orig_destination_distance}");
                            fi++;
                            si++;
                        }

                        else if (Math.Abs(delta) < MatchEpsilon)
                        {
                            if (fi < firstCityDestinations.Distances.Count - 1 && firstCityDestinations.Distances[fi].WinningCluster == second.WinningCluster)
                            {
                                // Try to step into a match
                                fi++;
                            }
                            else
                            {
                                // Try to step into a match
                                si++;
                            }
                        }
                        else
                        {
                            if (delta < 0) { fi++; } else { si++; }
                        }
                    }
                }
            }

            Console.WriteLine($"Initially created {hotels.Count} hotels");
            while (MergeHotels(hotels))
            {
                Console.WriteLine($"Merged down to {hotels.Count} hotels");
            }
            Console.WriteLine($"Ended up with {hotels.Count} hotels with {hotels.Sum(h => h.Distances.Count)} distances");
            int id = 0;
            hotels.ForEach(x => x.Id = id++);
            hotels.Select(x => x.ToString()).WriteAllLines(Settings.BasePath + HotelsAndDistancesFileName);
        }

        private bool MergeHotels(HashSet<Hotel> hotels)
        {
            bool thereWereMergers = false;
            Dictionary<Distance, Hotel> dictionary = new Dictionary<Distance, Hotel>();

            var list = hotels.ToList();
            for (int i = 0; i < list.Count - 1; i++)
            {
                var currentHotel = list[i];
                foreach (Distance distance in currentHotel.Distances.ToList())
                {
                    Hotel testHotel = dictionary.SafeGetValue(distance);
                    if (testHotel == null || testHotel == currentHotel)
                    {
                        continue;
                    }
                    currentHotel.Distances.UnionWith(testHotel.Distances);
                    hotels.Remove(testHotel);
                    thereWereMergers = true;
                }
                currentHotel.Distances.ForEach(x => dictionary[x] = currentHotel);
            }
            return thereWereMergers;
        }

        public class Hotel
        {
            public int srch_destination_id { get; set; }
            public int Id { get; set; }
            public int HotelCluster { get; set; }
            public HashSet<Distance> Distances { get; set; } = new HashSet<Distance>();
            public override string ToString()
            {
                return
                    $"{srch_destination_id}," +
                    $"{Id}," +
                    $"{HotelCluster}," +
                    $"{Distances.Select(x => $"{x.City}@{x.orig_destination_distance}").SJoin("|")}";
            }
        }

        [Test]
        public void Verify()
        {
            // Score the constant-distance ("wide leak") predictor on the train
            // validation split with MAP@5. Prints the overall score ("Total") and
            // the score restricted to searches where any labels were produced ("Specific").
            Console.WriteLine("Verify");
            Console.WriteLine($"MatchEpsilon: {MatchEpsilon}");
            List<_2_PreprocessSearches.Search> train = _2_PreprocessSearches.LoadTrainTraining().Take(MaxRows).ToList();
            Console.WriteLine($"Loaded {train.Count} train");
            List<_2_PreprocessSearches.Search> validation = _2_PreprocessSearches.LoadTrainValidation().Take(MaxRows).ToList();
            Console.WriteLine($"Loaded {validation.Count} validation");
            if (UseLeak)
            {
                // Enrich validation rows with leaked matches from the train split.
                _2_PreprocessSearches.GetLeak(train, validation);
            }

            Dictionary<HashableObjectList, City> cityDictionary = LoadCities();
            LoadConstantDistances(cityDictionary);

            Accumulator total = new Accumulator();
            Accumulator specific = new Accumulator();
            Console.WriteLine("Computing map...");
            validation.ParallelForEach(search =>
            {
                // Searches without a usable distance cannot be matched; score them 0.
                if (search.orig_destination_distance <= 0)
                {
                    total.LockedAddValue(0);
                    return;
                }

                List<int> labels = GetWideLeakLabels(cityDictionary, search);
                _2_PreprocessSearches.AddLeakToLabelList(search, labels);

                float precision = AveragePrecision.OneOf5ComputeAveragePrecision(search.hotel_cluster.Value, labels);
                if (labels.Any())
                {
                    // "Specific" averages only the searches where we actually predicted.
                    specific.LockedAddValue(precision);
                }
                total.LockedAddValue(precision);
            });
            Console.WriteLine($"Total: {total}, Specific: {specific}, Count: {cityDictionary.Values.Sum(x => x.ConstantDistanceCitySearchDestinations.Count)}");
        }

        private List<int> GetWideLeakLabels(Dictionary<HashableObjectList, City> cityDictionary, _2_PreprocessSearches.Search search)
        {
            List<int> labels = new List<int>();

            // Look deeper!
            City city = cityDictionary.SafeGetValue(City.GetCountryRegionCityHash(search.user_location_country, search.user_location_region, search.user_location_city));
            List<ConstantDistanceCitySearchDestination> others =
                city?.ConstantDistanceCitySearchDestinations.Where(x => x.srch_destination_id == GetSearchDestinationId(search)).ToList();

            if (city != null && others != null)
            {
                // Are there any hits for the other city that are appropriate?
                InstanceCounter<int> counter = new InstanceCounter<int>();
                others.ForEach(constantDistanceCitySearchDestination =>
                {
                    CitySearchDestination destination =
                        constantDistanceCitySearchDestination.City?.SearchDestinations.SafeGetValue(GetSearchDestinationId(search));

                    if (destination != null)
                    {
                        double pos = search.orig_destination_distance + constantDistanceCitySearchDestination.Delta;
                        List<Distance> hits =
                            destination
                                .Distances
                                .Where(x => Math.Abs(x.orig_destination_distance - pos) < MatchEpsilon)
                                .ToList();

                        if (hits.Any())
                        {
                            //Console.WriteLine("Found matches");
                            lock (counter)
                            {
                                counter.AddCounts(hits, h => h.WinningCluster);
                            }
                        }
                    }
                });
                if (counter.Count > 0)
                {
                    //Console.WriteLine("Found matches");
                    labels.AddRange(counter.GetSortedCounts().Select(x => x.Value).Distinct().Take(5));
                }
            }
            return labels;
        }

        private int GetSearchDestinationId(_2_PreprocessSearches.Search search)
        {
            //return search.srch_destination_id_base;
            return search.srch_destination_id;
        }

        private Dictionary<HashableObjectList, City> LoadCities()
        {
            Console.Write("Loading cities");
            List<_2_PreprocessSearches.Search> searches = _2_PreprocessSearches.LoadBinTrainSearches().Where(x => x.orig_destination_distance > 0).Take(MaxRows).ToList();
            Console.Write($" searches={searches.Count}");

            List<City> cities = ExtractCities(searches);
            Console.Write($", cities={cities.Count}");

            Dictionary<HashableObjectList, City> cityDictionary = cities.ToDictionary(x => x.CountryRegionCityHash, x => x);
            Console.WriteLine($", cityDictionary={cityDictionary.Count}");

            return cityDictionary;
        }

        private static void LoadConstantDistances(Dictionary<HashableObjectList, City> cityDictionary)
        {
            // Read the city-pair/delta file written by FindCitiesWithConstantDistanceDifferences
            // and attach each link to its source city. Rows referencing cities that were
            // not loaded (e.g. dropped by the MaxRows cap) are silently skipped.
            Console.Write("LoadConstantDistances");
            foreach (CsvRow csvRow in CsvFile.LoadRows(Settings.BaseSvnPath + CitiesWithConstantDistancesFileName).Take(MaxRows))
            {
                // Column layout (see the writer in FindMatch):
                //   0-2: from-city country, region, city
                //   3-5: to-city country, region, city
                //   6:   srch_destination_id   7: delta   8: error (if present; ignored here)
                // e.g. 66,442,3602,66,442,44774,383,-18.5861

                var fromCity = cityDictionary.SafeGetValue(City.GetCountryRegionCityHash(Convert.ToByte(csvRow[0]), Convert.ToInt16(csvRow[1]), Convert.ToUInt16(csvRow[2])));
                var toCity = cityDictionary.SafeGetValue(City.GetCountryRegionCityHash(Convert.ToByte(csvRow[3]), Convert.ToInt16(csvRow[4]), Convert.ToUInt16(csvRow[5])));
                if (fromCity != null && toCity != null)
                {
                    fromCity.ConstantDistanceCitySearchDestinations
                        .Add(
                            new ConstantDistanceCitySearchDestination
                            {
                                srch_destination_id = Convert.ToInt32(csvRow[6]),
                                Delta = Convert.ToDouble(csvRow[7]),
                                City = toCity
                            });
                }
            }

            Console.WriteLine($" {cityDictionary.Values.Sum(x => x.ConstantDistanceCitySearchDestinations.Count)} loaded");
        }

        [Test]
        public void FindCitiesWithConstantDistanceDifferences()
        {
            Console.WriteLine("FindCitiesWithConstantDistanceDifferences");
            List<_2_PreprocessSearches.Search> searches = _2_PreprocessSearches.LoadBinTrainSearches().Where(x => x.orig_destination_distance > 0).Take(MaxRows).ToList();
            Console.WriteLine($"searches:{searches.Count}");
            List<City> cities = ExtractCities(searches);
            Console.WriteLine($"cities:{cities.Count}");
            HitsFound = 0;
            //ShowCities(cities);

            TimeLeftAproximator aproximator = new TimeLeftAproximator(cities.Count);
            ILookup<HashableObjectList, City> proximityGrouped = cities.ToLookup(c => c.ProximityGroupingHash);
            cities.ParallelForEach(primaryCity =>
            {
                //if (MaxRows == int.MaxValue)
                //{
                //    Console.WriteLine($"Processing {primaryCity}");
                //}
                InstanceCounter<int> matchCount = new InstanceCounter<int>();
                foreach (City secondaryCity in proximityGrouped[primaryCity.ProximityGroupingHash])
                {
                    if (primaryCity == secondaryCity)
                    {
                        continue;
                    }

                    if (!primaryCity.CloseSearchDestinations.Overlaps(secondaryCity.CloseSearchDestinations))
                    {
                        continue;
                    }

                    if (!primaryCity.FarSearchDestinations.Overlaps(secondaryCity.FarSearchDestinations))
                    {
                        continue;
                    }

                    List<int> farSearchDestinations = primaryCity.FarSearchDestinations.Intersect(secondaryCity.FarSearchDestinations).ToList();
                    //Console.WriteLine($"{primaryCity} possible match for {secondaryCity} on {farSearchDestinations.Count()} destinations");
                    foreach (int farSearchDestination in farSearchDestinations)
                    {

                        CitySearchDestination primaryDestination = primaryCity.SearchDestinations[farSearchDestination];
                        CitySearchDestination secondaryDestination = secondaryCity.SearchDestinations[farSearchDestination];

                        if (primaryDestination.AverageDistance - secondaryDestination.AverageDistance > LimitForClose * 2/*
                            || primaryDestination.Distances.Count > secondaryDestination.Distances.Count * MaxDistanceDifferenceFactor
                            || secondaryDestination.Distances.Count > primaryDestination.Distances.Count * MaxDistanceDifferenceFactor*/)
                        {
                            //Console.WriteLine("Skipped - too far apart");
                            break;
                        }

                        //Console.WriteLine($"  {farSearchDestination}:");
                        if (FindMatch(primaryDestination, secondaryDestination))
                        {
                            matchCount.AddCount1(farSearchDestination);
                        }

                        //Console.WriteLine($"  primary  : {primaryDestination.Distances.Select(x => x.HotelClusterCounter.GetWinner().Value).SJoin()}");
                        //Console.WriteLine($"  secondary: {secondaryDestination.Distances.Select(x => x.HotelClusterCounter.GetWinner().Value).SJoin()}");
                    }
                }
                if (MaxRows == int.MaxValue)
                {
                    lock (this)
                    {
                        aproximator.IterationFinished();
                        if (aproximator.ItemsLeft % 50 == 0)
                        {
                            Console.WriteLine($"{100 - 100f * aproximator.ItemsLeft / cities.Count:0}% ({aproximator.ItemsLeft}), {aproximator.MinutesLeft:0.0}m");
                        }
                    }
                }
            });
            _hits.WriteAllLines(Settings.BaseSvnPath + CitiesWithConstantDistancesFileName);
            Console.WriteLine($"\nHitsFound:{HitsFound}");
        }

        private bool FindMatch(CitySearchDestination destination1, CitySearchDestination destination2)
        {
            //primary:    80, 80, 80, 80, 80, 80, 80,                                        27
            //secondary:          80, 80, 80, 80, 80, 3, 12, 51, 51, 12, 10, 61, 50, 76, 57, 27
            List<Distance> list = destination1.Distances.ToList();
            for (int i = 0; i < list.Count - 1; i++)
            {
                Distance firstSource = list[i];
                int firstCluster = firstSource.WinningCluster;
                List<double> firstHitDistances = destination2.ClusterDistances.SafeGetValue(firstCluster);
                if (firstHitDistances == null || !firstHitDistances.Any())
                {
                    continue;
                }

                int hits = 0;
                double previousHitOnDelta = 0;
                List<double> potentialJumps = firstHitDistances.Select(firstHitDistance => firstHitDistance - firstSource.orig_destination_distance).ToList();
                for (int j = i + 1; j < list.Count; j++)
                {
                    Distance secondSource = list[j];
                    int secondCluster = secondSource.WinningCluster;
                    List<double> secondHitDistances = destination2.ClusterDistances.SafeGetValue(secondCluster);

                    if (secondHitDistances == null || !secondHitDistances.Any())
                    {
                        continue;
                    }

                    foreach (double secondHitDistance in secondHitDistances)
                    {
                        double hitOnDelta =
                            potentialJumps
                                .FirstOrDefault(delta => Math.Abs(secondHitDistance - secondSource.orig_destination_distance - delta) < Epsilon);

                        if (Math.Abs(hitOnDelta) < float.Epsilon)
                        {
                            continue;
                        }
                        if (MinimumHits == 1)
                        {
                            previousHitOnDelta = hitOnDelta;
                        }
                        if (Math.Abs(previousHitOnDelta) > float.Epsilon && Math.Abs(previousHitOnDelta - hitOnDelta) > Epsilon)
                        {
                            continue;
                        }
                        hits++;
                        if (hits >= MinimumHits)
                        {
                            double delta = (hitOnDelta + previousHitOnDelta) * 0.5;
                            double error = Math.Abs(hitOnDelta - delta) + Math.Abs(previousHitOnDelta - delta);

                            lock (_hits)
                            {
                                _hits.Add(
                                    $"{destination1.City.user_location_country},{destination1.City.user_location_region},{destination1.City.user_location_city}," +
                                    $"{destination2.City.user_location_country},{destination2.City.user_location_region},{destination2.City.user_location_city}," +
                                    $"{destination2.srch_destination_id},{delta:0.####},{error:0.####}");

                                //if (_hits.Count % 50000 == 0)
                                //{
                                //    _hits.WriteAllLines(Settings.BaseSvnPath + CitiesWithConstantDistancesFileName);
                                //}
                            }
                            HitsFound++;
                            return true;
                        }
                        previousHitOnDelta = hitOnDelta;
                    }
                }
            }

            return false;
        }

        private static void ShowCities(List<City> cities)
        {
            Console.WriteLine($"Found {cities.Count} cities");
            foreach (City city in cities.Take(10))
            {
                Console.WriteLine($"city={city}");
                Console.WriteLine($"  close: {city.CloseSearchDestinations.Take(10).SJoin()}");
                Console.WriteLine($"  far: {city.FarSearchDestinations.Take(10).SJoin()}");

                foreach (CitySearchDestination citySearchDestination in city.SearchDestinations.Values.Take(10))
                {
                    Console.WriteLine($"  dest={citySearchDestination.srch_destination_id}, avg.dist={citySearchDestination.AverageDistance}");

                    foreach (Distance distance in citySearchDestination.Distances.Take(10))
                    {
                        Console.WriteLine(
                            $"    x={distance.orig_destination_distance}, clust={distance.HotelClusterCounter.GetSortedCounts().Take(2).Select(x => x.Value).SJoin()}");
                    }
                }
            }
        }

        private List<City> ExtractCities(List<_2_PreprocessSearches.Search> searches)
        {
            // Group raw searches into City objects: per city, per destination, one
            // Distance entry per observed orig_destination_distance with a counter
            // of the hotel clusters booked at that distance. Only cities that have
            // both a "close" and a "far" destination are kept. Runs in parallel;
            // cities are appended via LockedAdd.
            List<City> cities = new List<City>();
            searches.GroupBy(x => new { x.user_location_city, x.user_location_region, x.user_location_country })
                .ParallelForEach(
                grouping =>
                {
                    _2_PreprocessSearches.Search first = grouping.First();
                    City city = new City(first.posa_continent, first.user_location_country, first.user_location_region, first.user_location_city);

                    foreach (IGrouping<int, _2_PreprocessSearches.Search> grouping2 in grouping.GroupBy(GetSearchDestinationId))
                    {
                        CitySearchDestination destination = new CitySearchDestination
                        {
                            srch_destination_id = GetSearchDestinationId(grouping2.First()),
                            City = city
                        };

                        // One Distance per distinct reported distance value, counting
                        // which clusters were booked at that exact distance.
                        foreach (IGrouping<double, _2_PreprocessSearches.Search> searches1 in grouping2.GroupBy(x => x.orig_destination_distance))
                        {
                            var sfirst = searches1.First();
                            Distance distance = new Distance(city) { orig_destination_distance = sfirst.orig_destination_distance };
                            distance.HotelClusterCounter.AddCounts(searches1, s => s.hotel_cluster.Value);
                            destination.Distances.Add(distance);
                        }
                        // Sorted order is required by the merge-walks in IdentifyHotels/FindMatch.
                        destination.Distances.SortBy(x => x.orig_destination_distance);
                        destination.AverageDistance = destination.Distances.Average(x => x.orig_destination_distance);
                        destination.SearchDestinationSize = destination.Distances.Max(x => x.orig_destination_distance) -
                                                            destination.Distances.Min(x => x.orig_destination_distance);

                        //if (destination.Distances.Count < MinDistances)
                        //{
                        //    break;
                        //}

                        if (destination.AverageDistance < LimitForClose)
                        {
                            city.CloseSearchDestinations.Add(destination.srch_destination_id);
                        }

                        // Note: the close/far bands overlap (LimitForFar < LimitForClose),
                        // so a destination can be in both sets. Only "far" destinations
                        // are fully indexed for matching.
                        if (destination.AverageDistance > LimitForFar)
                        {
                            city.FarSearchDestinations.Add(destination.srch_destination_id);
                            city.SearchDestinations.Add(destination.srch_destination_id, destination);
                            destination.CreateClusterDistances();
                        }
                    }

                    if (city.CloseSearchDestinations.Any() && city.FarSearchDestinations.Any())
                    {
                        cities.LockedAdd(city);
                    }
                });
            return cities;
        }

        public class City
        {
            public City(byte posa_continent, byte user_location_country, short user_location_region, ushort user_location_city)
            {
                this.posa_continent = posa_continent;
                this.user_location_country = user_location_country;
                this.user_location_region = user_location_region;
                this.user_location_city = user_location_city;
                //ProximityGroupingHash = new HashableObjectList(posa_continent);
                ProximityGroupingHash = new HashableObjectList(user_location_country);
                //ProximityGroupingHash = new HashableObjectList(user_location_country, user_location_region);
                CountryRegionCityHash = GetCountryRegionCityHash(user_location_country, user_location_region, user_location_city);
            }

            public HashableObjectList ProximityGroupingHash { get; set; }
            public HashableObjectList CountryRegionCityHash { get; set; }

            public Dictionary<int, CitySearchDestination> SearchDestinations { get; set; } = new Dictionary<int, CitySearchDestination>();
            public byte user_location_country { get; set; }
            public short user_location_region { get; set; }
            public ushort user_location_city { get; set; }
            public byte posa_continent { get; set; }
            public HashSet<int> CloseSearchDestinations { get; set; } = new HashSet<int>();
            public HashSet<int> FarSearchDestinations { get; set; } = new HashSet<int>();

            public List<ConstantDistanceCitySearchDestination> ConstantDistanceCitySearchDestinations = new List<ConstantDistanceCitySearchDestination>();

            public override string ToString()
            {
                return $"{user_location_country}/{user_location_region}/{user_location_city}";
            }

            public static HashableObjectList GetCountryRegionCityHash(byte user_location_country, short user_location_region, ushort user_location_city)
            {
                return new HashableObjectList(
                    user_location_country,
                    user_location_region,
                    user_location_city);
            }
        }

        /// <summary>
        /// A destination whose distance from a city appears constant, recorded
        /// with its per-city offset.
        /// </summary>
        public class ConstantDistanceCitySearchDestination
        {
            public int srch_destination_id { get; set; }
            // NOTE(review): presumably the constant distance offset between this
            // destination and City — confirm against the code that fills it in.
            public double Delta { get; set; }
            // The user city this record belongs to.
            public City City { get; set; }
        }

        public class CitySearchDestination
        {
            public int srch_destination_id { get; set; }
            public List<Distance> Distances { get; set; } = new List<Distance>();
            public double AverageDistance { get; set; }
            public double SearchDestinationSize { get; set; }
            public Dictionary<int, List<double>> ClusterDistances { get; set; } = new Dictionary<int, List<double>>();
            public City City { get; set; }

            public void CreateClusterDistances()
            {
                foreach (Distance distance in Distances)
                {
                    foreach (var unsortedCount in distance.HotelClusterCounter.GetUnsortedCounts())
                    {
                        ClusterDistances.GetOrCreate(unsortedCount.Value, x => new List<double>()).Add(distance.orig_destination_distance);
                    }

                    distance.WinningCluster = distance.HotelClusterCounter.GetWinner().Value;
                }
            }
        }

        public class Distance
        {
            public Distance(City city)
            {
                City = city;
            }

            public City City { get; set; }
            public double orig_destination_distance { get; set; }
            public InstanceCounter<int> HotelClusterCounter { get; set; } = new InstanceCounter<int>();
            public int WinningCluster { get; set; }
            public override string ToString()
            {
                return $"{WinningCluster}@{orig_destination_distance}";
            }
        }

        public HashableObjectList CreateKey(_2_PreprocessSearches.Search search)
        {
            return
                HashableObjectList.Create(search, s => s.user_location_city, s => s.srch_destination_id);
        }
    }
}

Evolving a better (?) solution

This post relates to the Expedia Hotel Recommendations that I’m currently participating in on Kaggle. It’s extremely specific to this competition, if any of the techniques I use prove interesting, I’ll come up with something more general some time in the future. See discussions on this post.

If you’re looking for an executive summary of some machine learning strategies, then this post won’t punch your ticket.

I thought I’d pay back those who’ve been generous with information and scripts by outlining what I’m currently trying. It won’t probably be useful to you, but it may get some ideas going! I’m using c# and all my stuff is scratch-built and not open source. Sorry.

The goal of this competition is to predict which hotel category (out of 100) a user booked given historical data and information about the search that the user did. The current go-to solution for this competition seems to be a counting solution. You basically count the users that booked during a similar search (say destination country and same user location city). To suggest the best hotel category, you pick the most common hotel category for a given combination of features.

So: you combine a number of features (out of more than 22) and pick whichever hotel category is most common for that combination of features.

But this doesn’t give you all the hotel clusters you need. One that does really well may only give you a small percentage of the votes you need. So add the hotel clusters from another combination of features until you’ve a hotel cluster for each user.

Time is precious and it’s slipping away

There are several difficulties with this,

  • Loading the data to iterate takes a long time
  • It takes quite some time to evaluate a single given combination of features
  • Combining several combinations of features takes even longer, and it uses massive amounts of RAM. Some of my combinations combined 20+ combinations of several features.
  • Figuring out which features to combine and keeping track of how well they did is really painful.

Loading the data

I don’t know about you, but the first thing I do in a Kaggle is to try to make iterations quicker. And for this Kaggle, I converted the csv files to binary files. This made a world of difference. I have one file with my training data, one with my validation data and one with my testing data. Loading these files takes less than one minute, which cuts down on iteration time immensely.

Computing the hotel clusters of one feature combination

I code in c# and the Dictionary implementation is plenty fast. There isn’t much I can do to improve that. And the RAM requirements while building the dictionary; there isn’t much I can do about that. But there’s a trade off between storage and CPU, and that’s pre-computing! So what I do is I pre-compute the hotel-clusters of any given combination of features and simply store it to disk. If I ever change which hotel-clusters a given combination would generate, then I have to delete my files, but that’s rare. Pre-computing about 300 combinations of features take a couple of hours with my C# code. But once they’re there, they’re quite fast to load from disk!

Here’s a picture of the files of hotel clusters:

Pre-generated

As you can see, they’re quite compact, the largest ones (9M) are *tiny* compared to the CSV submissions you send to Kaggle (50M).

And here’s a snippet from the program that generates the pre-computed files;

orig_destination_distance, user_location_city    0.0558    0.8704    32139    0.1032    25.3    0.8704
srch_destination_id, user_id, user_location_city    0.0008    0.3629    1126    0.0509    11.9    0.3629

Now, if it seems like there would be a huge number of feature combinations, you’re right. It’s astronomical. So you have to limit the maximum number of features you’re going to combine (in my case I’m working with 4 at the moment) and which features to include (I’m working with 10 at the moment). This leaves me with 385 unique combinations. That’s doable. Add one more feature? Well, that number grows larger, we’re talking about factorials here.

Combining several combinations of features

To combine several features you traditionally would keep the dictionaries/counts in RAM, which quickly becomes prohibitive. But there’s no reason to do that, once you’ve determined which hotel clusters a particular combination of features would suggest given its counts – the counts are no longer useful and can be discarded. So to combine two feature combinations (say [hotel_country] and [srch_destination_id, user_id, user_location_city]) you simply concatenate the hotel clusters together until you have one per user. This becomes very fast. Which is required for the next step…

Evolving a list of feature combinations

How do we figure out which feature combinations to use? And in which order? And each feature combination can return a total of 5 hotel cluster suggestions, should we use all?

Did it seem like there would be a lot of feature combinations to manage (turned out to be 385 when limits were added)? Well, that holds doubly^10 true here. If you allow 15 different feature combinations, picked from 385 different variants, that number comes out to gazillions. Puts the fac in factorial. Exhaustive searching goes right out the window. Anyone has the right to their own opinion, but not their own fact(orial)s.

Clever clever

A clever few of you are thinking “yeah, but once you’ve used a combination, you wouldn’t use it again”. Well wrong, soo wrong. In an effort to give me even more options to work with, I’m only adding the *first* hotel cluster from any given feature combination. Using the same hotel combination once more will add the second and so on. Simply put; I add the first suggested hotel cluster that doesn’t already exist for that user.

He used to do surgery for girls in the eighties, but gravity always wins out

So how to figure out which frigging combinations to use? It’s hardly possible to even enumerate the possible ones! Well, evolution to the rescue! What I did was create a list evolver that uses trivial Genetic Algorithm techniques. It literally took me less than two hours to implement. You’ll find the actual code below.

It first creates 200 randomly generated cluster combination “lists”. The best ones are propagated into the next generation, where they get a chance to create some mutated versions of themselves. This goes on for quite a few generations – hopefully while better and better solutions come up. My best solution, at the moment, is at 0.50561, and that was created using the techniques described.

Combining techniques

My list evolver from the previous paragraph uses whatever solution gives the best results for the validation period. This means that other solutions can be worked into the process, even if they were generated by some other means than by the counting technique. Just as long as they generate appropriate results for the validation period. So in the end I need results for the validation period and the testing period. But that’s easy enough to generate. At that point, I can add XGBoost results (terrible!) with Dictionary results (pretty good!) and clustering techniques (remains to be seen!).

I don’t get it, how does evolution chose?

I’ve been using evolution (specifically Genetic Algorithms (GA), Genetic Programming (GP) and evolved neural networks) since about 1996 for different projects. GA isn’t an optimal solution to most (if any) problems. One problem is that it isn’t greedy, which means you can’t pick a nice trajectory and follow it. It’s inherently stochastic, which makes it slower than greedy or numerical solutions would be. But you can trivially use it anywhere, even when you don’t know of any greedy or numerical solution.

If you can measure which solution is better out of two different solutions (let them duel it out!), or how good a solution is in relation to all possible solutions (a score), you can make GA work. You just make more mutated copies of the currently best solution and keep the best ones as you go along. In this specific case, measure how well the solution would do in the competition. Keep the best ones, discard the worse ones.

A trivial example

Here’s a trivial example demonstrating how my list evolver works. A list of no more than 6 items must be filled with the numbers -1 to 10. Below you see the lists it comes up with during a couple of generations.

1: score=35/70*: 10, 9, 7, 1, 8
2: score=45/90*: 10, 10, 9, 7, 1, 8
3: score=51/102*: 10, 10, 9, 7, 7, 8
4: score=53/106*: 10, 10, 10, 9, 7, 7
5: score=56/112*: 10, 10, 10, 10, 9, 7
6: score=59/118*: 10, 10, 10, 10, 10, 9
7: score=60/120*: 10, 10, 10, 10, 10, 10

The solution is so obvious given the premises and it takes milliseconds to find. But finding a good combination of feature combinations to use in this kaggle competition? Takes about a day. And there’s no guarantee that it’s an optimal solution.

Feel free to ask questions, I’ll do my best to answer. During the competition questions are best addressed to where I announce this post in the forums.

Code for the list evolver class

using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;

namespace Boltzman.Core.Utilities
{
    /// <summary>
    /// Evolves a list of <typeparamref name="TListItem"/> with a simple genetic
    /// algorithm: each epoch keeps the best genotype, carries over tournament
    /// winners as elites, and refills the population with mutated offspring.
    /// </summary>
    public class ListEvolver<TListItem>
    {
        /// <param name="getFastScore">Cheap fitness function, run on every candidate.</param>
        /// <param name="getFullScore">
        /// Expensive fitness function, run only on candidates whose fast score
        /// is promising. May be null, in which case the fast score is reused.
        /// </param>
        /// <param name="possibleItems">Pool of items candidate lists draw from.</param>
        public ListEvolver(
            Func<Genotype, float> getFastScore,
            Func<Genotype, float> getFullScore,
            params TListItem[] possibleItems)
        {
            GetFastScore = getFastScore;
            GetFullScore = getFullScore;
            PossibleItems = possibleItems.ToList();
            PopulationSize = 100;
            BestGenotypeFastScoreCutoff = 0.8f;
            ParsimonyCost = 0;
            InitialSize = 5;
            TriedItems = new HashSet<OrderHashedList<TListItem>>();
        }

        // A candidate only gets the expensive full evaluation when its fast
        // score exceeds BestGenotype.FastScore * this cutoff.
        public float BestGenotypeFastScoreCutoff { get; set; }
        public int GenesInFullMutate { get; set; }
        public int PopulationSize { get; set; }
        // Length of freshly generated random lists.
        public int InitialSize { get; set; }
        public List<Genotype> Population { get; set; }
        // Score penalty per list item; rewards shorter lists when > 0.
        public float ParsimonyCost { get; set; }
        public Func<Genotype, float> GetFastScore { get; set; }
        public Func<Genotype, float> GetFullScore { get; set; }
        public List<TListItem> PossibleItems { get; set; }
        // Every list ever evaluated, so duplicates are never re-scored.
        public HashSet<OrderHashedList<TListItem>> TriedItems { get; set; }
        public Genotype BestGenotype { get; set; }
        // Invoked after each epoch; the bool is true when BestGenotype changed.
        public Action<bool> AfterEpochAction { get; set; }
        public int Epoch { get; private set; }
        public bool AllowMultipleEntriesOfSameItem { get; set; }
        // Maximum list length; 0 means unlimited.
        public int MaxLength { get; set; }

        /// <summary>
        /// Scores a candidate list. Returns null when the list was already
        /// tried, or when it did not earn a full evaluation (or was the very
        /// first genotype, which only seeds BestGenotype).
        /// </summary>
        public Genotype Evaluate(List<TListItem> items)
        {
            if (MaxLength > 0 && items.Count > MaxLength)
            {
                items = items.Take(MaxLength).ToList();
            }
            OrderHashedList<TListItem> orderHashedList = new OrderHashedList<TListItem>(items);

            lock (TriedItems)
            {
                if (TriedItems.Contains(orderHashedList))
                {
                    return null;
                }
            }

            Genotype genotype = new Genotype(items);
            genotype.FastScore = GetFastScore(genotype);
            if (BestGenotype == null || genotype.FastScore > BestGenotype.FastScore * BestGenotypeFastScoreCutoff)
            {
                if (GetFullScore == null)
                {
                    genotype.FullScore = genotype.FastScore;
                }
                else
                {
                    genotype.FullScore = GetFullScore(genotype);
                }
            }

            lock (TriedItems)
            {
                // Re-check under the lock: another thread may have evaluated
                // the same list while the scores were being computed above.
                if (TriedItems.Contains(orderHashedList))
                {
                    return null;
                }
                TriedItems.Add(orderHashedList);

                if (BestGenotype == null)
                {
                    BestGenotype = genotype;
                }
                else if (genotype.FullScore.HasValue)
                {
                    float actualScore = ComputeScore(genotype);
                    float bestActualScore = ComputeScore(BestGenotype);

                    if (actualScore > bestActualScore)
                    {
                        BestGenotype = genotype;
                    }
                    return genotype;
                }
            }
            return null;
        }

        /// <summary>
        /// Runs one generation: keeps the best genotype, copies tournament
        /// winners until ~10% of the old population, then fills the remainder
        /// with mutated offspring in parallel.
        /// </summary>
        public void RunEpoch()
        {
            Epoch++;

            if (Population == null)
            {
                BuildInitialPopulation();
                // BUGFIX: was an unconditional AfterEpochAction(true) that threw
                // NullReferenceException when no callback was registered; now
                // matches the null-safe invoke at the end of this method.
                AfterEpochAction?.Invoke(true);
                return;
            }

            List<Genotype> oldPopulation = Population.ToList();
            Population.Clear();
            Genotype previousBestGenotype = BestGenotype;

            List<Func<List<TListItem>, List<TListItem>>> actions =
                new List<Func<List<TListItem>, List<TListItem>>>
                {
                    x => MutateFull(InitialSize),
                    MutateDelete,
                    MutateReplace,
                    MutateRepositionSmall,
                    MutateRepositionLarge,
                    MutateInsert
                };

            if (AllowMultipleEntriesOfSameItem)
            {
                actions.Add(MutateSplit);
            }

            if (BestGenotype != null)
            {
                Population.Add(BestGenotype);
            }

            // Elitism: carry over tournament winners until 10% of the old size.
            while (Population.Count < oldPopulation.Count * 0.1f)
            {
                Population.Add(oldPopulation.RandomItems(10).Top(x => ComputeScore(x)));
            }

            // NOTE(review): starting at Count - 1 can overshoot PopulationSize
            // by one offspring; presumably harmless slack - confirm if exact
            // population sizes matter.
            Parallel.For(Population.Count - 1, PopulationSize, itemId =>
            {
                Genotype parent = oldPopulation.RandomItems(10).Top(x => ComputeScore(x));
                Genotype genotype = Evaluate(actions.RandomItem()(parent.List));
                if (genotype != null)
                {
                    Population.LockedAdd(genotype);
                }
            });

            AfterEpochAction?.Invoke(previousBestGenotype != BestGenotype);
        }

        // Fills Population with randomly generated lists until PopulationSize
        // is reached (duplicates rejected by Evaluate are simply skipped).
        private void BuildInitialPopulation()
        {
            Population = new List<Genotype>(PopulationSize);
            Parallel.For(
                0,
                PopulationSize, i =>
                {
                    lock (Population)
                    {
                        if (Population.Count == PopulationSize)
                        {
                            return;
                        }
                    }

                    List<TListItem> list = MutateFull(InitialSize);
                    Genotype genotype = Evaluate(list);
                    if (genotype != null)
                    {
                        Population.LockedAdd(genotype);
                    }
                });
        }

        // Overwrites one random position with a random item; tries up to five
        // times to draw an item not already in the list.
        private List<TListItem> MutateReplace(List<TListItem> list)
        {
            list = list.ToList();
            for (int i = 0; i < 5; i++)
            {
                TListItem item = PossibleItems.RandomItem();
                if (!list.Contains(item) || AllowMultipleEntriesOfSameItem)
                {
                    list[ThreadSafeRandom.GetNextInt(list.Count)] = item;
                    return list;
                }
            }

            return list;
        }

        // Duplicates a random item next to itself (only offered as a mutation
        // when multiple entries of the same item are allowed).
        private List<TListItem> MutateSplit(List<TListItem> list)
        {
            list = list.ToList();
            int pos = ThreadSafeRandom.GetNextInt(list.Count);
            list.Insert(pos, list[pos]);
            return list;
        }

        // Removes one random item (no-op on an empty list).
        private List<TListItem> MutateDelete(List<TListItem> list)
        {
            if (list.Count > 0)
            {
                list = list.ToList();
                list.RemoveAt(ThreadSafeRandom.GetNextInt(list.Count));
            }
            return list;
        }

        // Moves one random item to a random new position.
        private List<TListItem> MutateRepositionLarge(List<TListItem> list)
        {
            list = list.ToList();
            int pos = ThreadSafeRandom.GetNextInt(list.Count);
            TListItem moved = list[pos];
            list.RemoveAt(pos);
            list.Insert(ThreadSafeRandom.GetNextInt(list.Count), moved);
            return list;
        }

        // Swaps one random item with an immediate neighbour, clamping the
        // neighbour index to stay inside the list.
        private List<TListItem> MutateRepositionSmall(List<TListItem> list)
        {
            list = list.ToList();
            if (list.Count < 2)
            {
                return list;
            }
            int pos = ThreadSafeRandom.GetNextInt(list.Count);
            TListItem item = list[pos];
            int otherPos = pos + (ThreadSafeRandom.GetNextBool() ? -1 : 1);
            if (otherPos <= 0)
            {
                otherPos = pos + 1;
            }
            if (otherPos >= list.Count)
            {
                otherPos = pos - 1;
            }
            list[pos] = list[otherPos];
            list[otherPos] = item;
            return list;
        }

        // Inserts a random item at a random position; tries up to five times
        // to draw an item not already in the list.
        private List<TListItem> MutateInsert(List<TListItem> list)
        {
            list = list.ToList();
            for (int i = 0; i < 5; i++)
            {
                TListItem item = PossibleItems.RandomItem();
                if (!list.Contains(item) || AllowMultipleEntriesOfSameItem)
                {
                    list.Insert(ThreadSafeRandom.GetNextInt(list.Count + 1), item);
                    return list;
                }
            }

            return list;
        }

        // Builds a fresh random list of up to count distinct items (random
        // collisions are dropped, so the result may be shorter than count).
        public List<TListItem> MutateFull(int count)
        {
            List<TListItem> items = new List<TListItem>();
            for (int i = 0; i < count; i++)
            {
                TListItem item = PossibleItems.RandomItem();
                if (!items.Contains(item))
                {
                    items.Add(item);
                }
            }
            return items;
        }

        // Full score minus a per-item length penalty (parsimony pressure).
        private float ComputeScore(Genotype genotype)
        {
            return genotype.FullScore.Value - genotype.List.Count * ParsimonyCost;
        }

        /// <summary>One candidate list together with its scores.</summary>
        public class Genotype
        {
            public Genotype(List<TListItem> list)
            {
                List = list;
            }

            public List<TListItem> List { get; set; }
            public float FastScore { get; set; }
            // Null until/unless the candidate earned a full evaluation.
            public float? FullScore { get; set; }
        }
    }
}

Maskininlärning och Kaggle-Swe

[See this in English]

Kaggle är en sajt som hostar olika tävlingar inom maskininlärning. Det här blogginlägget kommer beskriva den första av de kaggletävlingar som jag deltagit i, vad jag gjorde och hur resultatet blev.

Kaggle beskriver sig själva som

The world’s largest community of data scientists compete to solve your most valuable problems

AXA Driver Telematics Analysis

AXA Driver Telematics Analysis gick ut på att “Använda telematisk data för att identifiera en förarsignatur”. Kort sagt så fick man tillgång till en enorm mängd bilturer (beskrivna med GPS-koordinater) som utförts av olika förare. Olika bidrag bedömdes på hur väl man lyckades identifiera vilken förare som utfört vilken biltur.

I tävlingen deltog 1528 deltagare och jag lyckades, efter mycket arbete, komma på plats 17. Av de 16 bidrag som kom före mig så lämnades 7 in av team och resten av enskilda individer. Jag valde att utveckla all min kod själv, från scratch, vilket kan verka opraktiskt, men det är så jag lär mig. Hitta en cool algoritm och sedan implementera den själv.

Anonymisering av data

Målsättningen för AXA, som betalade ut $30,000 i vinstpengar (plus pengar till Kaggle för att administrera tävlingen), var att försöka identifiera när “fel” förare framför ett fordon. Om det t.ex. blivit stulet eller utlånat.

Kaggle/AXA hade blandat upp körningar från en given förare med slumpvis valda körningar från någon annan förare. Målet var att identifiera de körningar som inte tillhörde huvudföraren. Detta innebar att dessa körningar utfördes i en annan stad än de körningar som den egentliga föraren utfört. Så för att hitta missmatchande körningar så räckte det att hitta körningar som kom från ett annat område och peka ut dessa som de felande körningarna.

Kaggle/AXA hade förutspått detta, så de gjorde om alla GPS koordinater till relativa koordinater, mätta i meter, med 0-punkten satt där körningen började. Utöver detta så klippte de bort ett stycke i början och ett stycke i slutet av varje körning. Sedan spegelvändes hälften av körningarna och de roterades mellan 0 och 360 grader. Detta för att försvåra att a) identifiera området och därmed göra tävlingen för lätt och b) identifiera den verkliga adressen som personen kom ifrån/åkte till. B utfördes för att skydda de personer som ingick i underlaget.

Så två identiska körningar (hur man nu åstadkommer detta) skulle se radikalt olika ut i underlaget. Bilden ovan visar de körningar som en speciell förare utfört. Lägg märke till att de åker slumpvis åt alla håll – så ser det förstås inte ut i verkligheten, de flesta åker hemifrån till vissa fasta punkter och tillbaka igen, så många körningar borde överlappa.

En modell som AXA kunde ha nytta av kan inte vara baserad på var föraren kör utan endast *hur* föraren kör, hur fort personen tar kurvor, hur personen ökar och minskar hastigheten etc. Därför försökte AXA/Kaggle se till att endast denna information gick att utvinna ur data.

Allt slutade i tårar

Det hela gick dock förfärligt snett så att inga utav de modeller som placerade sig högt gick att använda till något användbart. Man identifierade snabbt att det enda sättet att få konkurrenskraftiga resultat var att ändå försöka hitta vilka körningar som matchade andra körningar i databasen, detta kallades Trip Matching och bilden nedan visar ett antal trippar som matchats upp mot  varandra av en annan Kagglare. Min lösning påminner mycket om den lösningen de beskriver. Det faktum att rutten roterats gör att man istället för att titta på koordinaterna tittar på svängar som utförts och sedan försöker man hitta andra rutter som under en viss period har liknande svängar. Den som hade bäst Trip Matching metod vann. Man kunde försöka matcha resultatet med lite verklig Machine Learning, men utan bra Trip Matching så var man såld.

Processorkraft

Det krävdes enorm processorkraft för varje test som utfördes. Varje ändring av koden behövde köra igenom alla tripper för att skapa sitt resultat och mot slutet, när koden var som mest avancerad, så tog detta minst ett dygn. Så det handlade om gör en liten ändring, vänta ett dygn, gör en ny liten ändring, vänta ett dygn. Sedan dess har jag köpt en snabbare dator (6 cores, 12 hyperthreads med 32 GB RAM) men även den nya datorn har för lite RAM för de tävlingar jag jobbar med nu.

På slutet av denna Kaggle så hyrde jag in 3 stycken stora burkar på molnet (Microsoft Azure) för att kunna testa flera olika varianter parallellt…

Varför jag placerade så högt

Om sanningen skall fram så finns det tre huvudingredienser i mitt recept

  1. Jag hade tur när jag valde metod, det fanns tusentals andra där många av dessa hade varit sämre. Tyvärr finns det oftast ingen tydlig riktning att gå i när man får dåligt resultat. Så man måste testa åt ett par olika håll och fortsätta åt det håll som verkar mest lovande, men man famlar i blindo för att varje körning tar så lång tid.
  2. Jag har programmerat i 30 år så ingen algoritm är för krånglig för att implementera. Hade jag en idé kunde jag implementera och provköra den
  3. Jag nötte som *fan*
    Oftast så försöker jag bygga en metod som låter mig iterera rykande fort, för att jag aktivt skall kunna testa olika varianter, det blir mer kreativt och man minns mellan körningarna vad man var ute efter att göra. I den här tävlingen var det svårt.

Just nu tävlar jag i en Kaggle som handlar om resebranschen där jag försöker evolvera fram en lösning.

Machine Learning and Kaggle

Kaggle is a site that hosts different Machine Learning competitions. This blog entry is about the first Kaggle I competed in, what I did and the results thereof (#17 out of 1528).

This blog entry also exists in Swedish, just because I could.

Kaggle describe themselves thusly;

The world’s largest community of data scientists compete to solve your most valuable problems

AXA Driver Telematics Analysis

The goal of AXA Driver Telematics Analysis was to “Use telematics data to identify a driver signature”. In short, we were given a huge amount of “drives” (described by GPS coordinates) that had been performed by a large number of drivers. Entries were scored on how well they were able to identify who was driving the car.

There were 1,528 participants in the competition and I was able, after a lot of work, to place at #17. Out of the 16 entries that scored higher, 7 were teams and the rest were by individuals. To my mind, being beaten by a team hurts less than being beaten by an individual. And scoring well as an individual is cooler than scoring well as team. But maybe that’s just me.

I chose to develop all my code by myself, not using any Machine Learning packages in the process. If that seems impractical,  then rest assured that it really really is. But that’s how I learn. Find a cool algorithm and implement it from some kind of paper or Wiki description. It’s often tricky, but I always learn stuff! For this competition, I was trying out some deep learning stuff before it all ended up in tears (see below). But I had to develop my own deep learning (Neural Network stuff) for that to work. Before I threw it all out, that is.

Anonymizing the data

The goal for AXA, who put up $30,000 in winnings (plus whatever money they paid Kaggle for administrating the competition) was to  try to identify when the “wrong” driver was driving a vehicle. For instance when the vehicle had been stolen or lent out.

Kaggle/AXA had mixed the drives from a given driver with random drives from other drivers (incorrect drives). The goal was to identify drives that didn’t belong to the driver. The incorrect drives, having been driven by someone random, were most likely performed in another city or part of the country from the actual drives. So to a lot of drives that were virtually guaranteed to be from the correct driver, you could find parts of the drives that looked exactly the same. When the driver goes to or from work or the local store. Those parts of routes would turn up over and over again, making it easy to determine that they were “positives” (actual drives).

Kaggle/AXA predicted this and masked all the GPS coordinates as relative coordinates measured from the start of the drive. They also removed a short snippet from the start and the end of the drives. Then they mirrored 50% of the drives and rotated them a random angle between 0-360 degrees. All this to make it harder to a) match identical parts of trips and b) prevent Kagglers from identifying the actual address from any of the drivers. B was done to preserve the privacy of their customers.

Two identical drives (given that you could drive exactly the same twice, including timing) would look radically different from each other. The image above shows all the drives from a specific driver. Note how they go off in different angles and never overlap. That’s of course not how it would look in reality, most drives originate from home and go to a specific spot (work) or the reverse (work->home). Without mirroring and rotation, several routes would overlap.

A model that would be useful to AXA would base its predictions on other metrics, like the speed, how speed tends to change (acceleration and retardation), how the driver handles curves and so on. For that reason, AXA/Kaggle tried to force Kagglers to use that kind of data.

It all ended in tears

It all went terribly wrong, though. No models could place well using the metrics useful to AXA. It turned out that the only way to compete was to try to reverse the masking/anonymizing performed by AXA/Kaggle to figure out which drives were performed in areas typical to the main driver and which weren’t (called Trip Matching). If no one had used Trip Matching then the best strategies would possibly have been useful to AXA. But once one Kaggler used it, there was no going back. To compete, we all had to do it. And it turned out that Trip Matching was a fun problem to solve, just not very useful.

The picture below shows trips that have been matched up (by another Kaggler) and how they obviously were performed in the same area. Thus, they belonged to the true driver and could be given a high score.

My solution was very similar to what they describe; instead of comparing coordinates, the route was converted to turns (left and right) and these turns were aligned to other drives trying to find matches. The Kaggler with the best trip matching algorithm would win! You could try to mix this result with some actual Machine Learning, but that typically wasn’t what decided the result.

The reason this isn’t helpful to AXA is that they already knew which drives matched each other in world space, figuring that out didn’t help them. The winning solution was whoever was better at reversing the AXA/Kaggle obfuscation. Decidedly not helpful, but that’s what the competition became. I’m absolutely not saying this was wrong of the Kagglers – or that Kaggle or AXA did anything wrong. It’s just that we found a weakness and exploited it. This happens quite frequently and is something Kaggle/the competition owners try to prevent by masking the data. It’s not against the rules to exploit weaknesses though.

Processor power

This competition required huge amounts of processing power. The data ran into gigabytes and every hyperthread was sorely needed. Each new iteration of my algorithm took about 24 hours to run. So I would change one thing and run my tests. 24 hours later I would evaluate the result of that change and either revert it and make another change, or move forward with the new change tweaking something else. Imagine playing tennis with one day between the hits – it’s hard to get a good creative flow.

Iterating quickly is actually extremely important, because it allows you to test many more variants – but in this case (at least for me) that was not to be. Since this competition I’ve bought a faster computer (6 cores, 12 hyperthreads and 32GB of RAM), but already my new computer has too little RAM for the competitions I’m running.

At the end of this competition, I was renting three large cloud servers (Microsoft Azure) to be able to evaluate several solutions in parallel.

Why I placed where I did

There are three main ingredients to my recipe

  • I was lucky when picking my method. There were thousands of routes I could have gone down, many of which would have been worse. Unfortunately, there’s rarely a clear sign of which way to go forward – as there often is in software development. In these ML competitions, I find I prototype solutions and continue with whichever route seems most fruitful. That very likely leaves you following a sub-optimal route and forcing you to backtrack. In this case, I remember very vividly where I went wrong and how I would have placed higher had I gone a different route at that point. Oh, well.
  • I’ve been programming for 30 years and no algorithm is too difficult to implement. When I have an idea, I can implement and evaluate it. And I can do this very quickly allowing me to rapidly iterate.
  • I worked very hard at this problem. I probably would have made more money working as a consultant for the hours I spent than I would have made had I won the competition… But the amount I learned was well worth the investment!

Conclusion

Find a method that allows you to iterate quickly. Automate as much as possible. Precompute as much as possible. Find a method that allows you to accurately validate your performance offline, so you don’t have to waste submissions to figure out if you’re doing well.

Right now, I’m competing in a Kaggle related to the travel industry – where I’m trying to evolve a solution. I could use 100* the CPU/Disk power I currently have and still not have enough. But I’m learning every step of the way, and I’m enjoying it immensely! And currently I’m falling quickly in the ratings, but I’m hoping on this last run! Or the one after that…

LearntoCode.biz – Learn to code by solving puzzles

We’ve finally finalized our website for our game Machinist-Fabrique, and the site’s called http://learntocode.biz .
 
Machinist-Fabrique is a game where you learn to code by solving puzzles. The tools at your disposal are different code concepts. It’s fun and easy and suited for kids 10+ (and adults). We believe that learning to code is and will be an essential skill for our children and for ourselves.

Speed up MVC Visual Studio Development

Whenever I work with Visual Studio doing MVC development, I find that starting the web-site from a recompile takes far too long. So I asked my pal Tobias why it’s so fast when he’s doing it. The answer
I don’t do a full rebuild and I don’t attach the debugger
  1. Press Ctrl-F5 to start the website without the debugger
  2. Use the browser
  3. If you make changes,
    1. hit Shift-F6 to compile and update only the project of the file you’re currently in.
    2. if the solution has focus, the entire solution is rebuilt
    3. hit F5 to update the browser with the changes.
  4. If you need to debug attach the debugger by pressing Ctrl+Alt+P

Thanks, Tobias. I’m writing this blogpost so I won’t have to ask you again!

Machinist–Fabrique on IndieDB.com

My upcoming game, Machinist-Fabrique, is getting its own IndieDB entry – check it out at the link below (once it’s gone live, that is)

Machinist - Fabrique

Follow

Get every new post delivered to your Inbox.