{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "b6a0ec93",
   "metadata": {},
   "source": [
    "## Prepare the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "2a44b2be",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/3v/xcpv0w3x47d9mtww8gt5w_vh0000gn/T/ipykernel_37788/1821662786.py:4: DtypeWarning: Columns (9,10,11,12,13) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  data = pd.read_csv(\"allCountries.txt\", sep='\\t', header=None,\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   latitude  longitude country\n",
      "0  42.64991    1.53335      AD\n",
      "1  42.58765    1.74028      AD\n",
      "2  42.61203    1.47364      AD\n",
      "3  42.52535    1.73343      AD\n",
      "4  42.52915    1.73362      AD\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# read the data from the csv file\n",
    "data = pd.read_csv(\"allCountries.txt\", sep='\\t', header=None,\n",
    "                   names=[\"geonameid\",\"name\",\"asciiname\",\"alternatenames\",\"latitude\",\"longitude\",\"feature_class\",\"feature_code\",\"country\",\"cc2\",\"admin1_code\",\"admin2_code\",\"admin3_code\",\"admin4_code\",\"population\",\"elevation\",\"dem\",\"timezone\",\"modification_date\"])\n",
    "\n",
    "# select the columns of interest\n",
    "data = data[[\"latitude\",\"longitude\",\"country\"]]\n",
    "\n",
    "# display the first 5 rows of the DataFrame\n",
    "print(data.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "ee9987a5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>latitude</th>\n",
       "      <th>longitude</th>\n",
       "      <th>country</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>7226800</th>\n",
       "      <td>68.32394</td>\n",
       "      <td>13.79753</td>\n",
       "      <td>NO</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2070657</th>\n",
       "      <td>31.25342</td>\n",
       "      <td>92.09397</td>\n",
       "      <td>CN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11851183</th>\n",
       "      <td>33.98933</td>\n",
       "      <td>-84.96254</td>\n",
       "      <td>US</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11123355</th>\n",
       "      <td>34.16244</td>\n",
       "      <td>-104.31050</td>\n",
       "      <td>US</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8688032</th>\n",
       "      <td>44.51929</td>\n",
       "      <td>132.92021</td>\n",
       "      <td>RU</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          latitude  longitude country\n",
       "7226800   68.32394   13.79753      NO\n",
       "2070657   31.25342   92.09397      CN\n",
       "11851183  33.98933  -84.96254      US\n",
       "11123355  34.16244 -104.31050      US\n",
       "8688032   44.51929  132.92021      RU"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.sample(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "a24a538d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(12363388, 3)"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "13daf34e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "latitude         0\n",
      "longitude        0\n",
      "country      14551\n",
      "dtype: int64\n"
     ]
    }
   ],
   "source": [
    "print(data.isnull().sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "afe982b3",
   "metadata": {},
   "outputs": [],
   "source": [
    "data = data.dropna()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a26b919e",
   "metadata": {},
   "source": [
    "## Train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "45b09303",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.9970337294839029\n"
     ]
    }
   ],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "\n",
    "# Split the data into training and test sets\n",
    "X = data[[\"latitude\", \"longitude\"]]\n",
    "y = data[\"country\"]\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
    "\n",
    "# Create a KNN classifier with 5 nearest neighbors\n",
    "knn = KNeighborsClassifier(n_neighbors=4)\n",
    "\n",
    "# Fit the classifier to the training data\n",
    "knn.fit(X_train, y_train)\n",
    "\n",
    "# Print the accuracy of the model on the test data\n",
    "print(\"Accuracy:\", knn.score(X_test, y_test))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "70a1290c",
   "metadata": {},
   "source": [
    "## Collect random data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "d45aee36",
   "metadata": {},
   "outputs": [],
   "source": [
    "API_KEY = 'YOUR_API_KEY'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "fbd11bf3",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2500/2500 [24:46<00:00,  1.68it/s]\n"
     ]
    }
   ],
   "source": [
    "import random\n",
    "import requests\n",
    "from tqdm import tqdm\n",
    "\n",
    "# Set the number of coordinates to generate\n",
    "num_coordinates = 2500\n",
    "\n",
    "# Create an empty list to store the results\n",
    "results = []\n",
    "\n",
    "# Use tqdm to display progress\n",
    "for i in tqdm(range(num_coordinates)):\n",
    "    # Generate a random latitude and longitude\n",
    "    lat = random.uniform(-90, 90)\n",
    "    lon = random.uniform(-180, 180)\n",
    "\n",
    "    # Send a request to the geolocation API\n",
    "    try:\n",
    "        response = requests.get(f\"https://api.opencagedata.com/geocode/v1/json?q={lat}+{lon}&key={API_KEY}\")\n",
    "        data = response.json()\n",
    "        results.append((lat, lon, data['results'][0]['components']['country']))\n",
    "    except:\n",
    "        results.append((lat, lon, 'NA'))\n",
    "\n",
    "# Create a DataFrame from the results\n",
    "import pandas as pd\n",
    "df_new = pd.DataFrame(results, columns=[\"latitude\", \"longitude\", \"country\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "a347489a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "       latitude   longitude country\n",
      "80   -67.057900  151.629167      NA\n",
      "694   61.930620  -66.216010  Canada\n",
      "1667   7.481893 -155.699348      NA\n",
      "1669 -70.585375  -24.491804      NA\n",
      "2203  70.407657   28.774544  Norway\n"
     ]
    }
   ],
   "source": [
    "df_new.to_csv('opencage_samples.csv')\n",
    "print(df_new.sample(5))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "f784dc35",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_new = pd.read_csv('opencage_samples.csv')\n",
    "# Load the countryInfo.txt file\n",
    "with open(\"countryInfo.txt\") as f:\n",
    "    country_data = f.readlines()\n",
    "\n",
    "# Create a dictionary to map country names to country codes\n",
    "country_codes = {}\n",
    "for line in country_data:\n",
    "    if not line.startswith(\"#\"):\n",
    "        data = line.strip().split(\"\\t\")\n",
    "        country_codes[data[4]] = data[0]\n",
    "\n",
    "# Convert the country names to country codes in the new dataframe\n",
    "df_new[\"country\"] = df_new[\"country\"].map(country_codes)\n",
    "df_new = df_new.dropna()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "64e404dd",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>latitude</th>\n",
       "      <th>longitude</th>\n",
       "      <th>country</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>658</th>\n",
       "      <td>658</td>\n",
       "      <td>45.190044</td>\n",
       "      <td>-93.460327</td>\n",
       "      <td>US</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>515</th>\n",
       "      <td>515</td>\n",
       "      <td>50.675779</td>\n",
       "      <td>65.306393</td>\n",
       "      <td>KZ</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2160</th>\n",
       "      <td>2160</td>\n",
       "      <td>-22.501809</td>\n",
       "      <td>16.604622</td>\n",
       "      <td>NA</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1280</th>\n",
       "      <td>1280</td>\n",
       "      <td>17.219689</td>\n",
       "      <td>54.973136</td>\n",
       "      <td>OM</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2178</th>\n",
       "      <td>2178</td>\n",
       "      <td>-1.269204</td>\n",
       "      <td>106.984503</td>\n",
       "      <td>ID</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2298</th>\n",
       "      <td>2298</td>\n",
       "      <td>29.857824</td>\n",
       "      <td>-98.468546</td>\n",
       "      <td>US</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>166</th>\n",
       "      <td>166</td>\n",
       "      <td>34.445968</td>\n",
       "      <td>-83.261908</td>\n",
       "      <td>US</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2200</th>\n",
       "      <td>2200</td>\n",
       "      <td>0.482515</td>\n",
       "      <td>-58.392550</td>\n",
       "      <td>BR</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2376</th>\n",
       "      <td>2376</td>\n",
       "      <td>49.511296</td>\n",
       "      <td>93.285772</td>\n",
       "      <td>MN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1617</th>\n",
       "      <td>1617</td>\n",
       "      <td>44.357108</td>\n",
       "      <td>26.780326</td>\n",
       "      <td>RO</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      Unnamed: 0   latitude   longitude country\n",
       "658          658  45.190044  -93.460327      US\n",
       "515          515  50.675779   65.306393      KZ\n",
       "2160        2160 -22.501809   16.604622      NA\n",
       "1280        1280  17.219689   54.973136      OM\n",
       "2178        2178  -1.269204  106.984503      ID\n",
       "2298        2298  29.857824  -98.468546      US\n",
       "166          166  34.445968  -83.261908      US\n",
       "2200        2200   0.482515  -58.392550      BR\n",
       "2376        2376  49.511296   93.285772      MN\n",
       "1617        1617  44.357108   26.780326      RO"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_new.sample(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f0f49be5",
   "metadata": {},
   "source": [
    "## Re-evaluate the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "a00e2d78",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model accuracy: 0.986764705882353\n"
     ]
    }
   ],
   "source": [
    "from sklearn.metrics import accuracy_score\n",
    "\n",
    "# predict the country of the coordinates in the new dataframe\n",
    "predictions = knn.predict(df_new[[\"latitude\", \"longitude\"]])\n",
    "\n",
    "# calculate the accuracy of the model on the new dataframe\n",
    "accuracy = accuracy_score(df_new[\"country\"], predictions)\n",
    "print(f\"Model accuracy: {accuracy}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "58decb87",
   "metadata": {},
   "source": [
    "## Exploring misses"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "id": "11d44c9f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "      Unnamed: 0   latitude   longitude country predictions\n",
      "336          336 -18.241242   16.706105      NA          AO\n",
      "931          931 -25.104256   22.713657      BW          ZA\n",
      "958          958 -23.995538   17.280858      NA          ZA\n",
      "1348        1348  -9.672760 -138.965349      FR          PF\n",
      "1457        1457 -25.492231   16.244680      NA          ZA\n",
      "1674        1674  25.880065  -14.315396      MA          EH\n",
      "1991        1991  10.419924   23.470560      SD          CF\n",
      "1999        1999 -49.213029  -73.137265      AR          CL\n",
      "2160        2160 -22.501809   16.604622      NA          BW\n"
     ]
    }
   ],
   "source": [
    "df_new[\"predictions\"] = predictions\n",
    "print(df_new.loc[df_new[\"country\"] != df_new[\"predictions\"]])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}