{
"cells": [
{
"cell_type": "markdown",
"id": "b6a0ec93",
"metadata": {},
"source": [
"## Prepare the data"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "2a44b2be",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/3v/xcpv0w3x47d9mtww8gt5w_vh0000gn/T/ipykernel_37788/1821662786.py:4: DtypeWarning: Columns (9,10,11,12,13) have mixed types. Specify dtype option on import or set low_memory=False.\n",
" data = pd.read_csv(\"allCountries.txt\", sep='\\t', header=None,\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
" latitude longitude country\n",
"0 42.64991 1.53335 AD\n",
"1 42.58765 1.74028 AD\n",
"2 42.61203 1.47364 AD\n",
"3 42.52535 1.73343 AD\n",
"4 42.52915 1.73362 AD\n"
]
}
],
"source": [
"import pandas as pd\n",
"\n",
"# read the data from the csv file\n",
"data = pd.read_csv(\"allCountries.txt\", sep='\\t', header=None,\n",
" names=[\"geonameid\",\"name\",\"asciiname\",\"alternatenames\",\"latitude\",\"longitude\",\"feature_class\",\"feature_code\",\"country\",\"cc2\",\"admin1_code\",\"admin2_code\",\"admin3_code\",\"admin4_code\",\"population\",\"elevation\",\"dem\",\"timezone\",\"modification_date\"])\n",
"\n",
"# select the columns of interest\n",
"data = data[[\"latitude\",\"longitude\",\"country\"]]\n",
"\n",
"# display the first 5 rows of the DataFrame\n",
"print(data.head())"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "ee9987a5",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" latitude | \n",
" longitude | \n",
" country | \n",
"
\n",
" \n",
" \n",
" \n",
" 7226800 | \n",
" 68.32394 | \n",
" 13.79753 | \n",
" NO | \n",
"
\n",
" \n",
" 2070657 | \n",
" 31.25342 | \n",
" 92.09397 | \n",
" CN | \n",
"
\n",
" \n",
" 11851183 | \n",
" 33.98933 | \n",
" -84.96254 | \n",
" US | \n",
"
\n",
" \n",
" 11123355 | \n",
" 34.16244 | \n",
" -104.31050 | \n",
" US | \n",
"
\n",
" \n",
" 8688032 | \n",
" 44.51929 | \n",
" 132.92021 | \n",
" RU | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" latitude longitude country\n",
"7226800 68.32394 13.79753 NO\n",
"2070657 31.25342 92.09397 CN\n",
"11851183 33.98933 -84.96254 US\n",
"11123355 34.16244 -104.31050 US\n",
"8688032 44.51929 132.92021 RU"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.sample(5)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "a24a538d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(12363388, 3)"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.shape"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "13daf34e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"latitude 0\n",
"longitude 0\n",
"country 14551\n",
"dtype: int64\n"
]
}
],
"source": [
"print(data.isnull().sum())"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "afe982b3",
"metadata": {},
"outputs": [],
"source": [
"data = data.dropna()"
]
},
{
"cell_type": "markdown",
"id": "a26b919e",
"metadata": {},
"source": [
"## Train"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "45b09303",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy: 0.9970337294839029\n"
]
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"\n",
"# Split the data into training and test sets\n",
"X = data[[\"latitude\", \"longitude\"]]\n",
"y = data[\"country\"]\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
"\n",
"# Create a KNN classifier with 5 nearest neighbors\n",
"knn = KNeighborsClassifier(n_neighbors=4)\n",
"\n",
"# Fit the classifier to the training data\n",
"knn.fit(X_train, y_train)\n",
"\n",
"# Print the accuracy of the model on the test data\n",
"print(\"Accuracy:\", knn.score(X_test, y_test))\n"
]
},
{
"cell_type": "markdown",
"id": "70a1290c",
"metadata": {},
"source": [
"## Collect random data"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "d45aee36",
"metadata": {},
"outputs": [],
"source": [
"API_KEY = 'YOUR_API_KEY'"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "fbd11bf3",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2500/2500 [24:46<00:00, 1.68it/s]\n"
]
}
],
"source": [
"import random\n",
"import requests\n",
"from tqdm import tqdm\n",
"\n",
"# Set the number of coordinates to generate\n",
"num_coordinates = 2500\n",
"\n",
"# Create an empty list to store the results\n",
"results = []\n",
"\n",
"# Use tqdm to display progress\n",
"for i in tqdm(range(num_coordinates)):\n",
" # Generate a random latitude and longitude\n",
" lat = random.uniform(-90, 90)\n",
" lon = random.uniform(-180, 180)\n",
"\n",
" # Send a request to the geolocation API\n",
" try:\n",
" response = requests.get(f\"https://api.opencagedata.com/geocode/v1/json?q={lat}+{lon}&key={API_KEY}\")\n",
" data = response.json()\n",
" results.append((lat, lon, data['results'][0]['components']['country']))\n",
" except:\n",
" results.append((lat, lon, 'NA'))\n",
"\n",
"# Create a DataFrame from the results\n",
"import pandas as pd\n",
"df_new = pd.DataFrame(results, columns=[\"latitude\", \"longitude\", \"country\"])"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "a347489a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" latitude longitude country\n",
"80 -67.057900 151.629167 NA\n",
"694 61.930620 -66.216010 Canada\n",
"1667 7.481893 -155.699348 NA\n",
"1669 -70.585375 -24.491804 NA\n",
"2203 70.407657 28.774544 Norway\n"
]
}
],
"source": [
"df_new.to_csv('opencage_samples.csv')\n",
"print(df_new.sample(5))"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "f784dc35",
"metadata": {},
"outputs": [],
"source": [
"df_new = pd.read_csv('opencage_samples.csv')\n",
"# Load the countryInfo.txt file\n",
"with open(\"countryInfo.txt\") as f:\n",
" country_data = f.readlines()\n",
"\n",
"# Create a dictionary to map country names to country codes\n",
"country_codes = {}\n",
"for line in country_data:\n",
" if not line.startswith(\"#\"):\n",
" data = line.strip().split(\"\\t\")\n",
" country_codes[data[4]] = data[0]\n",
"\n",
"# Convert the country names to country codes in the new dataframe\n",
"df_new[\"country\"] = df_new[\"country\"].map(country_codes)\n",
"df_new = df_new.dropna()"
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "64e404dd",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Unnamed: 0 | \n",
" latitude | \n",
" longitude | \n",
" country | \n",
"
\n",
" \n",
" \n",
" \n",
" 658 | \n",
" 658 | \n",
" 45.190044 | \n",
" -93.460327 | \n",
" US | \n",
"
\n",
" \n",
" 515 | \n",
" 515 | \n",
" 50.675779 | \n",
" 65.306393 | \n",
" KZ | \n",
"
\n",
" \n",
" 2160 | \n",
" 2160 | \n",
" -22.501809 | \n",
" 16.604622 | \n",
" NA | \n",
"
\n",
" \n",
" 1280 | \n",
" 1280 | \n",
" 17.219689 | \n",
" 54.973136 | \n",
" OM | \n",
"
\n",
" \n",
" 2178 | \n",
" 2178 | \n",
" -1.269204 | \n",
" 106.984503 | \n",
" ID | \n",
"
\n",
" \n",
" 2298 | \n",
" 2298 | \n",
" 29.857824 | \n",
" -98.468546 | \n",
" US | \n",
"
\n",
" \n",
" 166 | \n",
" 166 | \n",
" 34.445968 | \n",
" -83.261908 | \n",
" US | \n",
"
\n",
" \n",
" 2200 | \n",
" 2200 | \n",
" 0.482515 | \n",
" -58.392550 | \n",
" BR | \n",
"
\n",
" \n",
" 2376 | \n",
" 2376 | \n",
" 49.511296 | \n",
" 93.285772 | \n",
" MN | \n",
"
\n",
" \n",
" 1617 | \n",
" 1617 | \n",
" 44.357108 | \n",
" 26.780326 | \n",
" RO | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Unnamed: 0 latitude longitude country\n",
"658 658 45.190044 -93.460327 US\n",
"515 515 50.675779 65.306393 KZ\n",
"2160 2160 -22.501809 16.604622 NA\n",
"1280 1280 17.219689 54.973136 OM\n",
"2178 2178 -1.269204 106.984503 ID\n",
"2298 2298 29.857824 -98.468546 US\n",
"166 166 34.445968 -83.261908 US\n",
"2200 2200 0.482515 -58.392550 BR\n",
"2376 2376 49.511296 93.285772 MN\n",
"1617 1617 44.357108 26.780326 RO"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_new.sample(10)"
]
},
{
"cell_type": "markdown",
"id": "f0f49be5",
"metadata": {},
"source": [
"## Re-evaluate the model"
]
},
{
"cell_type": "code",
"execution_count": 55,
"id": "a00e2d78",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model accuracy: 0.986764705882353\n"
]
}
],
"source": [
"from sklearn.metrics import accuracy_score\n",
"\n",
"# predict the country of the coordinates in the new dataframe\n",
"predictions = knn.predict(df_new[[\"latitude\", \"longitude\"]])\n",
"\n",
"# calculate the accuracy of the model on the new dataframe\n",
"accuracy = accuracy_score(df_new[\"country\"], predictions)\n",
"print(f\"Model accuracy: {accuracy}\")"
]
},
{
"cell_type": "markdown",
"id": "58decb87",
"metadata": {},
"source": [
"## Exploring misses"
]
},
{
"cell_type": "code",
"execution_count": 58,
"id": "11d44c9f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Unnamed: 0 latitude longitude country predictions\n",
"336 336 -18.241242 16.706105 NA AO\n",
"931 931 -25.104256 22.713657 BW ZA\n",
"958 958 -23.995538 17.280858 NA ZA\n",
"1348 1348 -9.672760 -138.965349 FR PF\n",
"1457 1457 -25.492231 16.244680 NA ZA\n",
"1674 1674 25.880065 -14.315396 MA EH\n",
"1991 1991 10.419924 23.470560 SD CF\n",
"1999 1999 -49.213029 -73.137265 AR CL\n",
"2160 2160 -22.501809 16.604622 NA BW\n"
]
}
],
"source": [
"df_new[\"predictions\"] = predictions\n",
"print(df_new.loc[df_new[\"country\"] != df_new[\"predictions\"]])"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}