Given one million points in a three dimensional space, this program prints the ‘k’ nearest points to a given point.

In this implementation, one million 3D points, a reference point with respect to which k-nearest neighbors are to be found and the ‘k’ value are randomly generated by the program. The algorithm is based on max-heap implementation.

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 |
#!/usr/bin/python2 """ @author: Vasanthi Vuppuluri Date created: May 3, 2015 Purpose: -------- - To find the k-nearest neighbors from a given point, in a set of 1 million points in a 3D space """ import itertools import math import random # One million random 3D co-ordinates, a 3D co-ordinate to be used as a reference and 'k' are generated def generate_million_coordinates(): try: x = random.sample(range(0,1000), 100) # A list of 100 unique integers y = random.sample(range(0,1000), 100) z = random.sample(range(0,1000), 100) # A tuple = co-ordinate with reference to which k-nearest neighbors are to be found _point = tuple(random.sample(range(0,1000), 3)) # 'k' value is randomly selected k = random.choice(range(10,1000)) except ValueError: print "ERROR: Sample size exceeded population size!" # One million 3D co-ordinates are created # Each co-ordinate is of type 'tuple' _million_3D_coordinates = list(itertools.product(x,y,z)) return _million_3D_coordinates, _point, k # Distance from reference point to each of the one million co-ordinates is calculated # Distance formula used here = sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2 + (z1 - z2) ** 2) def distances(_million_3D_coordinates, _point): dict_of_distances = {} # A dictionary to store distance of all one million co-ordinates from reference _point list_of_distances = [] # A list to store distance of all one million co-ordinates from reference _point for _coordinate in _million_3D_coordinates: _distance = math.sqrt((_point[0] - _coordinate[0]) ** 2 + (_point[1] - _coordinate[1]) ** 2 + (_point[2] - _coordinate[2]) ** 2) dict_of_distances[_distance] = _coordinate list_of_distances.append(_distance) _distance = 0 return dict_of_distances, list_of_distances # Max-heap function to shift the max value to the root position of the heap when a new element is appended at the end of the heap list def max_heap_shift_up(heap_list_of_k_elements, _position_of_root): while _position_of_root / 2 > 0: if heap_list_of_k_elements[_position_of_root] > heap_list_of_k_elements[_position_of_root / 2]: temp = heap_list_of_k_elements[_position_of_root / 2] heap_list_of_k_elements[_position_of_root] = heap_list_of_k_elements[_position_of_root / 2] heap_list_of_k_elements[_position_of_root / 2] = temp _position_of_root = _position_of_root / 2 return heap_list_of_k_elements # Max-heap function to insert a new element into the heap list def max_heap_insert_element(heap_list_of_k_elements, _distance): heap_list_of_k_elements.append(_distance) _current_length_of_heap_list = len(heap_list_of_k_elements) - 1 heap_list_of_k_elements = max_heap_shift_up(heap_list_of_k_elements, _current_length_of_heap_list) return heap_list_of_k_elements # Max-heap function to find the position of the maximum child in the heap def max_heap_max_child_position(heap_list_of_k_elements, _position): if _position * 2 + 1 > len(heap_list_of_k_elements) - 1: return _position * 2 else: if heap_list_of_k_elements[_position * 2] > heap_list_of_k_elements[_position * 2 + 1]: return _position * 2 else: return _position * 2 + 1 # Max-heap function to re-organize the heap elements after a deletion, such that maximum element is at the root def max_heap_shift_down(heap_list_of_k_elements, _position_of_root): while _position_of_root * 2 <= len(heap_list_of_k_elements): _max_child_position = max_heap_max_child_position(heap_list_of_k_elements, _position_of_root) if heap_list_of_k_elements[_position_of_root] < heap_list_of_k_elements[_max_child_position]: temp = heap_list_of_k_elements[_position_of_root] heap_list_of_k_elements[_position_of_root] = heap_list_of_k_elements[_max_child_position] heap_list_of_k_elements[_max_child_position] = temp _position_of_root = _max_child_position return heap_list_of_k_elements # Max-heap function to delete the root node from the heap def max_heap_delete_max(heap_list_of_k_elements): # Maximum value is present at the root, in this case, second element of the list, since first is '0' # We swap maximum value with last element in the heap list and pop the last element to delete the root heap_list_of_k_elements[1] = heap_list_of_k_elements[-1] # Swap's second element with the last element del heap_list_of_k_elements[-1] # Deletes the last list element, which is the smallest element in the heap # Since, root is swapped is smallest element, we need to shift it down to the correct position heap_list_of_k_elements = max_heap_shift_down(heap_list_of_k_elements, 1) return heap_list_of_k_elements # Max Heap # Once the distances are calculated a max-heap is to built with 'k' heap elements, in this case distances # Once max-heap is created, a new element is inserted into the heap and maximum is deleted # Above step is repeated n-k times # In the end we'll be left with k smallest distances => k-nearest neighbors def build_max_heap(list_of_distances, k): heap_list_of_k_elements_raw = [] # A list to store 'k + 1' heap elements, with one element being '0' # Max-heap of first 'k' elements from the list of distances and the element '0' at 0th position of the heap list heap_list_of_k_elements_raw.append(0) heap_list_of_k_elements_raw += list_of_distances[:k] length_of_heap_list = len(heap_list_of_k_elements_raw) - 1 while length_of_heap_list > 0: heap_list_of_k_elements_sorted = max_heap_shift_up(heap_list_of_k_elements_raw, length_of_heap_list) length_of_heap_list =- 1 if verbose_output == 'Y': if len(heap_list_of_k_elements_raw[1:]) == k: print "Initial heap built with first '%d' distances is successfully built" %(k) # The rest of n-k elements are inserted one by one into the heap list # After an element is inserted, heap is sorted and root is deleted # This process deleted n-k maximum distances from the max-heap leaving behind 'k' smallest distances in the heap # The co-ordinates corresponding to these 'k' distances are the k-nearest neighbors for _distance in list_of_distances[k:]: heap_list_of_k_elements_sorted = max_heap_insert_element(heap_list_of_k_elements_sorted, _distance) heap_list_of_k_elements_sorted = max_heap_delete_max(heap_list_of_k_elements_sorted) if verbose_output == 'Y': if len(heap_list_of_k_elements_sorted[1:]) == k: print "Heap of '%d' smallest distances is built" %(k) return heap_list_of_k_elements_sorted[1:] if __name__ == "__main__": verbose_output = raw_input("Do you want to see a verbose output? (Y|N) ").upper() _million_3D_coordinates, _point, k = generate_million_coordinates() dict_of_distances, list_of_distances = distances(_million_3D_coordinates, _point) if verbose_output == 'Y': if len(_million_3D_coordinates) == 1000000: print "One million 3D points randomly generated" print "'k' (10 <= k <= 1000) value randomly generated is: %d\n" %(k) list_of_k_smallest_distances = build_max_heap(list_of_distances, k) if len(list_of_k_smallest_distances) == k: for _distance in list_of_k_smallest_distances: print dict_of_distances[_distance] |

## Leave a Reply