## Prepare the data

In [1]:
import pandas as pd

# read the data from the csv file
data = pd.read_csv("allCountries.txt", sep='\t', header=None,
                   names=["geonameid","name","asciiname","alternatenames","latitude","longitude","feature_class","feature_code","country","cc2","admin1_code","admin2_code","admin3_code","admin4_code","population","elevation","dem","timezone","modification_date"])

# select the columns of interest
data = data[["latitude","longitude","country"]]

# display the first 5 rows of the DataFrame
print(data.head())

  data = pd.read_csv("allCountries.txt", sep='\t', header=None,


   latitude  longitude country
0  42.64991    1.53335      AD
1  42.58765    1.74028      AD
2  42.61203    1.47364      AD
3  42.52535    1.73343      AD
4  42.52915    1.73362      AD


In [7]:
data.sample(5)

Unnamed: 0,latitude,longitude,country
7226800,68.32394,13.79753,NO
2070657,31.25342,92.09397,CN
11851183,33.98933,-84.96254,US
11123355,34.16244,-104.3105,US
8688032,44.51929,132.92021,RU


In [2]:
data.shape

(12363388, 3)

In [3]:
print(data.isnull().sum())

latitude         0
longitude        0
country      14551
dtype: int64


In [4]:
data = data.dropna()

## Train

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Split the data into training and test sets
X = data[["latitude", "longitude"]]
y = data["country"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a KNN classifier with 5 nearest neighbors
knn = KNeighborsClassifier(n_neighbors=4)

# Fit the classifier to the training data
knn.fit(X_train, y_train)

# Print the accuracy of the model on the test data
print("Accuracy:", knn.score(X_test, y_test))


Accuracy: 0.9970337294839029


## Collect random data

In [9]:
API_KEY = 'YOUR_API_KEY'

In [12]:
import random
import requests
from tqdm import tqdm

# Set the number of coordinates to generate
num_coordinates = 2500

# Create an empty list to store the results
results = []

# Use tqdm to display progress
for i in tqdm(range(num_coordinates)):
    # Generate a random latitude and longitude
    lat = random.uniform(-90, 90)
    lon = random.uniform(-180, 180)

    # Send a request to the geolocation API
    try:
        response = requests.get(f"https://api.opencagedata.com/geocode/v1/json?q={lat}+{lon}&key={API_KEY}")
        data = response.json()
        results.append((lat, lon, data['results'][0]['components']['country']))
    except:
        results.append((lat, lon, 'NA'))

# Create a DataFrame from the results
import pandas as pd
df_new = pd.DataFrame(results, columns=["latitude", "longitude", "country"])

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2500/2500 [24:46<00:00,  1.68it/s]


In [20]:
df_new.to_csv('opencage_samples.csv')
print(df_new.sample(5))

       latitude   longitude country
80   -67.057900  151.629167      NA
694   61.930620  -66.216010  Canada
1667   7.481893 -155.699348      NA
1669 -70.585375  -24.491804      NA
2203  70.407657   28.774544  Norway


In [53]:
df_new = pd.read_csv('opencage_samples.csv')
# Load the countryInfo.txt file
with open("countryInfo.txt") as f:
    country_data = f.readlines()

# Create a dictionary to map country names to country codes
country_codes = {}
for line in country_data:
    if not line.startswith("#"):
        data = line.strip().split("\t")
        country_codes[data[4]] = data[0]

# Convert the country names to country codes in the new dataframe
df_new["country"] = df_new["country"].map(country_codes)
df_new = df_new.dropna()

In [54]:
df_new.sample(10)

Unnamed: 0.1,Unnamed: 0,latitude,longitude,country
658,658,45.190044,-93.460327,US
515,515,50.675779,65.306393,KZ
2160,2160,-22.501809,16.604622,
1280,1280,17.219689,54.973136,OM
2178,2178,-1.269204,106.984503,ID
2298,2298,29.857824,-98.468546,US
166,166,34.445968,-83.261908,US
2200,2200,0.482515,-58.39255,BR
2376,2376,49.511296,93.285772,MN
1617,1617,44.357108,26.780326,RO


## Re-evaluate the model

In [55]:
from sklearn.metrics import accuracy_score

# predict the country of the coordinates in the new dataframe
predictions = knn.predict(df_new[["latitude", "longitude"]])

# calculate the accuracy of the model on the new dataframe
accuracy = accuracy_score(df_new["country"], predictions)
print(f"Model accuracy: {accuracy}")

Model accuracy: 0.986764705882353


## Exploring misses

In [58]:
df_new["predictions"] = predictions
print(df_new.loc[df_new["country"] != df_new["predictions"]])

      Unnamed: 0   latitude   longitude country predictions
336          336 -18.241242   16.706105      NA          AO
931          931 -25.104256   22.713657      BW          ZA
958          958 -23.995538   17.280858      NA          ZA
1348        1348  -9.672760 -138.965349      FR          PF
1457        1457 -25.492231   16.244680      NA          ZA
1674        1674  25.880065  -14.315396      MA          EH
1991        1991  10.419924   23.470560      SD          CF
1999        1999 -49.213029  -73.137265      AR          CL
2160        2160 -22.501809   16.604622      NA          BW
