Code from Expedia Hotel Recommendations
June 12, 2016 Leave a comment
using System; using System.Collections.Generic; using System.Linq; using Boltzman.Core.Utilities; using Boltzman.Core.Utilities.Csv; using Boltzman.Tests; using NUnit.Framework; namespace Boltzman.ExpediaHotelRecommendations { //Getting leaked info 23.434s //Total: 0.2288588(1976304), Specific: 0.8881389(509261) [TestFixture] public class ConstantDistanceFinder : TestFixtureBase { public static int MaxRows = 600 * 1000; //public static int MaxRows = int.MaxValue; public float LimitForClose = 230; public float LimitForFar = 30; public int MinimumHits = 1; public float Epsilon = 0.03f; public float MatchEpsilon = 0.031f; // Total: 0.01394861(600000), Specific: 0.6348937(13182), Count: 6159 => 0.51051 // Total: 0.02265029(600000), Specific: 0.5906204(23010), Count: 14449 => 0.51772 // Total: 0.03969542(600000), Specific: 0.4976858(47856), Count: 121121 => 0.52171 // Total: 0.03984083(600000), Specific: 0.4974508(48054), Count: 124630 public int HitsFound; public static bool UseLeak = false; public static string CitiesWithConstantDistancesFileName = "CitiesWithConstantDistances.txt"; public static string SubmissionFileName = "WideLeak.csv"; public static string HotelsAndDistancesFileName = "hotels_and_distances.csv"; private readonly List<string> _hits = new List<string>(); [Test] public void Double() { FindCitiesWithConstantDistanceDifferences(); Verify(); } [Test] public void FullDoubleWithSubmissionAndMix() { /* HitsFound:7709628 Verify MatchEpsilon: 0.01 Loaded 11186881 train Loaded 1976304 validation Getting leaked info 23.645s Loading cities searches=24145292, cities=10283, cityDictionary=10283 LoadConstantDistances 7709633 loaded Computing map... Total: 0.3378158(1976304), Specific: 0.7528934(886748), Count: 7709633 */ UseLeak = true; MaxRows = int.MaxValue; FindCitiesWithConstantDistanceDifferences(); Verify(); BuildSubmission(); MixIntoSubmission(); } [Test] public void MixIntoSubmission() { new SubmissionMixer().Mix("WideLeak.csv", "aggregator 0.50566.csv", "WideLeakAggregator.csv"); } [Test] public void BuildSubmission() { UseLeak = true; MaxRows = int.MaxValue; Console.WriteLine("BuildSubmission"); List<_2_PreprocessSearches.Search> train = _2_PreprocessSearches.LoadBinTrainSearches().ToList(); Console.WriteLine($"Loaded {train.Count} train"); List<_2_PreprocessSearches.Search> test = _2_PreprocessSearches.LoadBinTestSearches().ToList(); Console.WriteLine($"Loaded {test.Count} test"); _2_PreprocessSearches.GetLeak(train, test); Dictionary<HashableObjectList, City> cityDictionary = LoadCities(); LoadConstantDistances(cityDictionary); //List<List<byte>> selectedClusters = new List<List<byte>>(); List<List<byte>> selectedClusters = test.ParallelSelect(search => { var labels = GetWideLeakLabels(cityDictionary, search); _2_PreprocessSearches.AddLeakToLabelList(search, labels); return labels.Select(x => (byte)x).ToList(); }); //foreach (_2_PreprocessSearches.Search search in test) //{ // var labels = GetWideLeakLabels(cityDictionary, search); // _2_PreprocessSearches.AddLeakToLabelList(search, labels); // selectedClusters.Add(labels.Select(x => (byte)x).ToList()); //} _6_Aggregator2.SaveSubmission(selectedClusters, test, SubmissionFileName); } [Test] public void IdentifyHotels() { // For each city, join up hotels given the distances computed Console.WriteLine("IdentifyHotels"); Dictionary<HashableObjectList, City> cityDictionary = LoadCities(); LoadConstantDistances(cityDictionary); Dictionary<Distance, Hotel> distanceHotelDictionary = new Dictionary<Distance, Hotel>(); HashSet<Hotel> hotels = new HashSet<Hotel>(); foreach (City firstCity in cityDictionary.Values) { foreach (ConstantDistanceCitySearchDestination constantDistanceCitySearchDestination in firstCity.ConstantDistanceCitySearchDestinations) { City secondCity = constantDistanceCitySearchDestination.City; // Now step along both arrays to figure out which cities line up CitySearchDestination firstCityDestinations = firstCity.SearchDestinations.SafeGetValue(constantDistanceCitySearchDestination.srch_destination_id); CitySearchDestination secondCityDestinations = secondCity.SearchDestinations.SafeGetValue(constantDistanceCitySearchDestination.srch_destination_id); if (firstCityDestinations == null || secondCityDestinations == null) { continue; } int fi = 0; int si = 0; while (true) { if (fi >= firstCityDestinations.Distances.Count) { break; } if (si >= secondCityDestinations.Distances.Count) { break; } var first = firstCityDestinations.Distances[fi]; var second = secondCityDestinations.Distances[si]; double delta = (first.orig_destination_distance + constantDistanceCitySearchDestination.Delta - second.orig_destination_distance); if (Math.Abs(delta) < MatchEpsilon && first.WinningCluster == second.WinningCluster) { var firstHotel = distanceHotelDictionary.SafeGetValue(first); var secondHotel = distanceHotelDictionary.SafeGetValue(second); if (firstHotel == null && secondHotel == null) { Hotel hotel = new Hotel { srch_destination_id = constantDistanceCitySearchDestination.srch_destination_id, HotelCluster = first.WinningCluster }; hotel.Distances.Add(first); hotel.Distances.Add(second); distanceHotelDictionary.Add(first, hotel); distanceHotelDictionary.Add(second, hotel); hotels.Add(hotel); } else if (firstHotel != null && secondHotel != null) { if (firstHotel != secondHotel) { firstHotel.Distances.UnionWith(secondHotel.Distances); secondHotel.Distances.ForEach(x => distanceHotelDictionary[x] = firstHotel); hotels.Remove(secondHotel); } } else { firstHotel?.Distances.Add(second); secondHotel?.Distances.Add(first); } //Console.WriteLine($"{firstCity},{secondCity},{constantDistanceCitySearchDestination.srch_destination_id},{constantDistanceCitySearchDestination.Delta},{first.orig_destination_distance}, {second.orig_destination_distance}"); fi++; si++; } else if (Math.Abs(delta) < MatchEpsilon) { if (fi < firstCityDestinations.Distances.Count - 1 && firstCityDestinations.Distances[fi].WinningCluster == second.WinningCluster) { // Try to step into a match fi++; } else { // Try to step into a match si++; } } else { if (delta < 0) { fi++; } else { si++; } } } } } Console.WriteLine($"Initially created {hotels.Count} hotels"); while (MergeHotels(hotels)) { Console.WriteLine($"Merged down to {hotels.Count} hotels"); } Console.WriteLine($"Ended up with {hotels.Count} hotels with {hotels.Sum(h => h.Distances.Count)} distances"); int id = 0; hotels.ForEach(x => x.Id = id++); hotels.Select(x => x.ToString()).WriteAllLines(Settings.BasePath + HotelsAndDistancesFileName); } private bool MergeHotels(HashSet<Hotel> hotels) { bool thereWereMergers = false; Dictionary<Distance, Hotel> dictionary = new Dictionary<Distance, Hotel>(); var list = hotels.ToList(); for (int i = 0; i < list.Count - 1; i++) { var currentHotel = list[i]; foreach (Distance distance in currentHotel.Distances.ToList()) { Hotel testHotel = dictionary.SafeGetValue(distance); if (testHotel == null || testHotel == currentHotel) { continue; } currentHotel.Distances.UnionWith(testHotel.Distances); hotels.Remove(testHotel); thereWereMergers = true; } currentHotel.Distances.ForEach(x => dictionary[x] = currentHotel); } return thereWereMergers; } public class Hotel { public int srch_destination_id { get; set; } public int Id { get; set; } public int HotelCluster { get; set; } public HashSet<Distance> Distances { get; set; } = new HashSet<Distance>(); public override string ToString() { return $"{srch_destination_id}," + $"{Id}," + $"{HotelCluster}," + $"{Distances.Select(x => $"{x.City}@{x.orig_destination_distance}").SJoin("|")}"; } } [Test] public void Verify() { Console.WriteLine("Verify"); Console.WriteLine($"MatchEpsilon: {MatchEpsilon}"); List<_2_PreprocessSearches.Search> train = _2_PreprocessSearches.LoadTrainTraining().Take(MaxRows).ToList(); Console.WriteLine($"Loaded {train.Count} train"); List<_2_PreprocessSearches.Search> validation = _2_PreprocessSearches.LoadTrainValidation().Take(MaxRows).ToList(); Console.WriteLine($"Loaded {validation.Count} validation"); if (UseLeak) { _2_PreprocessSearches.GetLeak(train, validation); } Dictionary<HashableObjectList, City> cityDictionary = LoadCities(); LoadConstantDistances(cityDictionary); Accumulator total = new Accumulator(); Accumulator specific = new Accumulator(); Console.WriteLine("Computing map..."); validation.ParallelForEach(search => { if (search.orig_destination_distance <= 0) { total.LockedAddValue(0); return; } List<int> labels = GetWideLeakLabels(cityDictionary, search); _2_PreprocessSearches.AddLeakToLabelList(search, labels); float precision = AveragePrecision.OneOf5ComputeAveragePrecision(search.hotel_cluster.Value, labels); if (labels.Any()) { specific.LockedAddValue(precision); } total.LockedAddValue(precision); }); Console.WriteLine($"Total: {total}, Specific: {specific}, Count: {cityDictionary.Values.Sum(x => x.ConstantDistanceCitySearchDestinations.Count)}"); } private List<int> GetWideLeakLabels(Dictionary<HashableObjectList, City> cityDictionary, _2_PreprocessSearches.Search search) { List<int> labels = new List<int>(); // Look deeper! City city = cityDictionary.SafeGetValue(City.GetCountryRegionCityHash(search.user_location_country, search.user_location_region, search.user_location_city)); List<ConstantDistanceCitySearchDestination> others = city?.ConstantDistanceCitySearchDestinations.Where(x => x.srch_destination_id == GetSearchDestinationId(search)).ToList(); if (city != null && others != null) { // Are there any hits for the other city that are appropriate? InstanceCounter<int> counter = new InstanceCounter<int>(); others.ForEach(constantDistanceCitySearchDestination => { CitySearchDestination destination = constantDistanceCitySearchDestination.City?.SearchDestinations.SafeGetValue(GetSearchDestinationId(search)); if (destination != null) { double pos = search.orig_destination_distance + constantDistanceCitySearchDestination.Delta; List<Distance> hits = destination .Distances .Where(x => Math.Abs(x.orig_destination_distance - pos) < MatchEpsilon) .ToList(); if (hits.Any()) { //Console.WriteLine("Found matches"); lock (counter) { counter.AddCounts(hits, h => h.WinningCluster); } } } }); if (counter.Count > 0) { //Console.WriteLine("Found matches"); labels.AddRange(counter.GetSortedCounts().Select(x => x.Value).Distinct().Take(5)); } } return labels; } private int GetSearchDestinationId(_2_PreprocessSearches.Search search) { //return search.srch_destination_id_base; return search.srch_destination_id; } private Dictionary<HashableObjectList, City> LoadCities() { Console.Write("Loading cities"); List<_2_PreprocessSearches.Search> searches = _2_PreprocessSearches.LoadBinTrainSearches().Where(x => x.orig_destination_distance > 0).Take(MaxRows).ToList(); Console.Write($" searches={searches.Count}"); List<City> cities = ExtractCities(searches); Console.Write($", cities={cities.Count}"); Dictionary<HashableObjectList, City> cityDictionary = cities.ToDictionary(x => x.CountryRegionCityHash, x => x); Console.WriteLine($", cityDictionary={cityDictionary.Count}"); return cityDictionary; } private static void LoadConstantDistances(Dictionary<HashableObjectList, City> cityDictionary) { Console.Write("LoadConstantDistances"); foreach (CsvRow csvRow in CsvFile.LoadRows(Settings.BaseSvnPath + CitiesWithConstantDistancesFileName).Take(MaxRows)) { // 0 1 2 3 4 5 6 7 // 66,442,3602,66,442,44774,383,-18.5861 // "{destination1.City.user_location_country},{destination1.City.user_location_region},{destination1.City.user_location_city},{destination2.City.user_location_city},{destination2.srch_destination_id},{hitOnDelta:0.####}" /* $"{destination1.City.posa_continent},{destination1.City.user_location_country},{destination1.City.user_location_region},{destination1.City.user_location_city}," + $"{destination2.City.posa_continent},{destination2.City.user_location_country},{destination2.City.user_location_region},{destination2.City.user_location_city}," + $"{destination2.srch_destination_id},{delta:0.####},{error:0.####}");;*/ var fromCity = cityDictionary.SafeGetValue(City.GetCountryRegionCityHash(Convert.ToByte(csvRow[0]), Convert.ToInt16(csvRow[1]), Convert.ToUInt16(csvRow[2]))); var toCity = cityDictionary.SafeGetValue(City.GetCountryRegionCityHash(Convert.ToByte(csvRow[3]), Convert.ToInt16(csvRow[4]), Convert.ToUInt16(csvRow[5]))); if (fromCity != null && toCity != null) { fromCity.ConstantDistanceCitySearchDestinations .Add( new ConstantDistanceCitySearchDestination { srch_destination_id = Convert.ToInt32(csvRow[6]), Delta = Convert.ToDouble(csvRow[7]), City = toCity }); } } Console.WriteLine($" {cityDictionary.Values.Sum(x => x.ConstantDistanceCitySearchDestinations.Count)} loaded"); } [Test] public void FindCitiesWithConstantDistanceDifferences() { Console.WriteLine("FindCitiesWithConstantDistanceDifferences"); List<_2_PreprocessSearches.Search> searches = _2_PreprocessSearches.LoadBinTrainSearches().Where(x => x.orig_destination_distance > 0).Take(MaxRows).ToList(); Console.WriteLine($"searches:{searches.Count}"); List<City> cities = ExtractCities(searches); Console.WriteLine($"cities:{cities.Count}"); HitsFound = 0; //ShowCities(cities); TimeLeftAproximator aproximator = new TimeLeftAproximator(cities.Count); ILookup<HashableObjectList, City> proximityGrouped = cities.ToLookup(c => c.ProximityGroupingHash); cities.ParallelForEach(primaryCity => { //if (MaxRows == int.MaxValue) //{ // Console.WriteLine($"Processing {primaryCity}"); //} InstanceCounter<int> matchCount = new InstanceCounter<int>(); foreach (City secondaryCity in proximityGrouped[primaryCity.ProximityGroupingHash]) { if (primaryCity == secondaryCity) { continue; } if (!primaryCity.CloseSearchDestinations.Overlaps(secondaryCity.CloseSearchDestinations)) { continue; } if (!primaryCity.FarSearchDestinations.Overlaps(secondaryCity.FarSearchDestinations)) { continue; } List<int> farSearchDestinations = primaryCity.FarSearchDestinations.Intersect(secondaryCity.FarSearchDestinations).ToList(); //Console.WriteLine($"{primaryCity} possible match for {secondaryCity} on {farSearchDestinations.Count()} destinations"); foreach (int farSearchDestination in farSearchDestinations) { CitySearchDestination primaryDestination = primaryCity.SearchDestinations[farSearchDestination]; CitySearchDestination secondaryDestination = secondaryCity.SearchDestinations[farSearchDestination]; if (primaryDestination.AverageDistance - secondaryDestination.AverageDistance > LimitForClose * 2/* || primaryDestination.Distances.Count > secondaryDestination.Distances.Count * MaxDistanceDifferenceFactor || secondaryDestination.Distances.Count > primaryDestination.Distances.Count * MaxDistanceDifferenceFactor*/) { //Console.WriteLine("Skipped - too far apart"); break; } //Console.WriteLine($" {farSearchDestination}:"); if (FindMatch(primaryDestination, secondaryDestination)) { matchCount.AddCount1(farSearchDestination); } //Console.WriteLine($" primary : {primaryDestination.Distances.Select(x => x.HotelClusterCounter.GetWinner().Value).SJoin()}"); //Console.WriteLine($" secondary: {secondaryDestination.Distances.Select(x => x.HotelClusterCounter.GetWinner().Value).SJoin()}"); } } if (MaxRows == int.MaxValue) { lock (this) { aproximator.IterationFinished(); if (aproximator.ItemsLeft % 50 == 0) { Console.WriteLine($"{100 - 100f * aproximator.ItemsLeft / cities.Count:0}% ({aproximator.ItemsLeft}), {aproximator.MinutesLeft:0.0}m"); } } } }); _hits.WriteAllLines(Settings.BaseSvnPath + CitiesWithConstantDistancesFileName); Console.WriteLine($"\nHitsFound:{HitsFound}"); } private bool FindMatch(CitySearchDestination destination1, CitySearchDestination destination2) { //primary: 80, 80, 80, 80, 80, 80, 80, 27 //secondary: 80, 80, 80, 80, 80, 3, 12, 51, 51, 12, 10, 61, 50, 76, 57, 27 List<Distance> list = destination1.Distances.ToList(); for (int i = 0; i < list.Count - 1; i++) { Distance firstSource = list[i]; int firstCluster = firstSource.WinningCluster; List<double> firstHitDistances = destination2.ClusterDistances.SafeGetValue(firstCluster); if (firstHitDistances == null || !firstHitDistances.Any()) { continue; } int hits = 0; double previousHitOnDelta = 0; List<double> potentialJumps = firstHitDistances.Select(firstHitDistance => firstHitDistance - firstSource.orig_destination_distance).ToList(); for (int j = i + 1; j < list.Count; j++) { Distance secondSource = list[j]; int secondCluster = secondSource.WinningCluster; List<double> secondHitDistances = destination2.ClusterDistances.SafeGetValue(secondCluster); if (secondHitDistances == null || !secondHitDistances.Any()) { continue; } foreach (double secondHitDistance in secondHitDistances) { double hitOnDelta = potentialJumps .FirstOrDefault(delta => Math.Abs(secondHitDistance - secondSource.orig_destination_distance - delta) < Epsilon); if (Math.Abs(hitOnDelta) < float.Epsilon) { continue; } if (MinimumHits == 1) { previousHitOnDelta = hitOnDelta; } if (Math.Abs(previousHitOnDelta) > float.Epsilon && Math.Abs(previousHitOnDelta - hitOnDelta) > Epsilon) { continue; } hits++; if (hits >= MinimumHits) { double delta = (hitOnDelta + previousHitOnDelta) * 0.5; double error = Math.Abs(hitOnDelta - delta) + Math.Abs(previousHitOnDelta - delta); lock (_hits) { _hits.Add( $"{destination1.City.user_location_country},{destination1.City.user_location_region},{destination1.City.user_location_city}," + $"{destination2.City.user_location_country},{destination2.City.user_location_region},{destination2.City.user_location_city}," + $"{destination2.srch_destination_id},{delta:0.####},{error:0.####}"); //if (_hits.Count % 50000 == 0) //{ // _hits.WriteAllLines(Settings.BaseSvnPath + CitiesWithConstantDistancesFileName); //} } HitsFound++; return true; } previousHitOnDelta = hitOnDelta; } } } return false; } private static void ShowCities(List<City> cities) { Console.WriteLine($"Found {cities.Count} cities"); foreach (City city in cities.Take(10)) { Console.WriteLine($"city={city}"); Console.WriteLine($" close: {city.CloseSearchDestinations.Take(10).SJoin()}"); Console.WriteLine($" far: {city.FarSearchDestinations.Take(10).SJoin()}"); foreach (CitySearchDestination citySearchDestination in city.SearchDestinations.Values.Take(10)) { Console.WriteLine($" dest={citySearchDestination.srch_destination_id}, avg.dist={citySearchDestination.AverageDistance}"); foreach (Distance distance in citySearchDestination.Distances.Take(10)) { Console.WriteLine( $" x={distance.orig_destination_distance}, clust={distance.HotelClusterCounter.GetSortedCounts().Take(2).Select(x => x.Value).SJoin()}"); } } } } private List<City> ExtractCities(List<_2_PreprocessSearches.Search> searches) { List<City> cities = new List<City>(); searches.GroupBy(x => new { x.user_location_city, x.user_location_region, x.user_location_country }) .ParallelForEach( grouping => { _2_PreprocessSearches.Search first = grouping.First(); City city = new City(first.posa_continent, first.user_location_country, first.user_location_region, first.user_location_city); foreach (IGrouping<int, _2_PreprocessSearches.Search> grouping2 in grouping.GroupBy(GetSearchDestinationId)) { CitySearchDestination destination = new CitySearchDestination { srch_destination_id = GetSearchDestinationId(grouping2.First()), City = city }; foreach (IGrouping<double, _2_PreprocessSearches.Search> searches1 in grouping2.GroupBy(x => x.orig_destination_distance)) { var sfirst = searches1.First(); Distance distance = new Distance(city) { orig_destination_distance = sfirst.orig_destination_distance }; distance.HotelClusterCounter.AddCounts(searches1, s => s.hotel_cluster.Value); destination.Distances.Add(distance); } destination.Distances.SortBy(x => x.orig_destination_distance); destination.AverageDistance = destination.Distances.Average(x => x.orig_destination_distance); destination.SearchDestinationSize = destination.Distances.Max(x => x.orig_destination_distance) - destination.Distances.Min(x => x.orig_destination_distance); //if (destination.Distances.Count < MinDistances) //{ // break; //} if (destination.AverageDistance < LimitForClose) { city.CloseSearchDestinations.Add(destination.srch_destination_id); } if (destination.AverageDistance > LimitForFar) { city.FarSearchDestinations.Add(destination.srch_destination_id); city.SearchDestinations.Add(destination.srch_destination_id, destination); destination.CreateClusterDistances(); } } if (city.CloseSearchDestinations.Any() && city.FarSearchDestinations.Any()) { cities.LockedAdd(city); } }); return cities; } public class City { public City(byte posa_continent, byte user_location_country, short user_location_region, ushort user_location_city) { this.posa_continent = posa_continent; this.user_location_country = user_location_country; this.user_location_region = user_location_region; this.user_location_city = user_location_city; //ProximityGroupingHash = new HashableObjectList(posa_continent); ProximityGroupingHash = new HashableObjectList(user_location_country); //ProximityGroupingHash = new HashableObjectList(user_location_country, user_location_region); CountryRegionCityHash = GetCountryRegionCityHash(user_location_country, user_location_region, user_location_city); } public HashableObjectList ProximityGroupingHash { get; set; } public HashableObjectList CountryRegionCityHash { get; set; } public Dictionary<int, CitySearchDestination> SearchDestinations { get; set; } = new Dictionary<int, CitySearchDestination>(); public byte user_location_country { get; set; } public short user_location_region { get; set; } public ushort user_location_city { get; set; } public byte posa_continent { get; set; } public HashSet<int> CloseSearchDestinations { get; set; } = new HashSet<int>(); public HashSet<int> FarSearchDestinations { get; set; } = new HashSet<int>(); public List<ConstantDistanceCitySearchDestination> ConstantDistanceCitySearchDestinations = new List<ConstantDistanceCitySearchDestination>(); public override string ToString() { return $"{user_location_country}/{user_location_region}/{user_location_city}"; } public static HashableObjectList GetCountryRegionCityHash(byte user_location_country, short user_location_region, ushort user_location_city) { return new HashableObjectList( user_location_country, user_location_region, user_location_city); } } public class ConstantDistanceCitySearchDestination { public int srch_destination_id { get; set; } public double Delta { get; set; } public City City { get; set; } } public class CitySearchDestination { public int srch_destination_id { get; set; } public List<Distance> Distances { get; set; } = new List<Distance>(); public double AverageDistance { get; set; } public double SearchDestinationSize { get; set; } public Dictionary<int, List<double>> ClusterDistances { get; set; } = new Dictionary<int, List<double>>(); public City City { get; set; } public void CreateClusterDistances() { foreach (Distance distance in Distances) { foreach (var unsortedCount in distance.HotelClusterCounter.GetUnsortedCounts()) { ClusterDistances.GetOrCreate(unsortedCount.Value, x => new List<double>()).Add(distance.orig_destination_distance); } distance.WinningCluster = distance.HotelClusterCounter.GetWinner().Value; } } } public class Distance { public Distance(City city) { City = city; } public City City { get; set; } public double orig_destination_distance { get; set; } public InstanceCounter<int> HotelClusterCounter { get; set; } = new InstanceCounter<int>(); public int WinningCluster { get; set; } public override string ToString() { return $"{WinningCluster}@{orig_destination_distance}"; } } public HashableObjectList CreateKey(_2_PreprocessSearches.Search search) { return HashableObjectList.Create(search, s => s.user_location_city, s => s.srch_destination_id); } } }