{ "cells": [ { "cell_type": "markdown", "id": "b6a0ec93", "metadata": {}, "source": [ "## Prepare the data" ] }, { "cell_type": "code", "execution_count": 1, "id": "2a44b2be", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/3v/xcpv0w3x47d9mtww8gt5w_vh0000gn/T/ipykernel_37788/1821662786.py:4: DtypeWarning: Columns (9,10,11,12,13) have mixed types. Specify dtype option on import or set low_memory=False.\n", " data = pd.read_csv(\"allCountries.txt\", sep='\\t', header=None,\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " latitude longitude country\n", "0 42.64991 1.53335 AD\n", "1 42.58765 1.74028 AD\n", "2 42.61203 1.47364 AD\n", "3 42.52535 1.73343 AD\n", "4 42.52915 1.73362 AD\n" ] } ], "source": [
"import pandas as pd\n",
"\n",
"# Column layout of the GeoNames dump; the file itself has no header row.\n",
"geonames_columns = [\"geonameid\",\"name\",\"asciiname\",\"alternatenames\",\"latitude\",\"longitude\",\"feature_class\",\"feature_code\",\"country\",\"cc2\",\"admin1_code\",\"admin2_code\",\"admin3_code\",\"admin4_code\",\"population\",\"elevation\",\"dem\",\"timezone\",\"modification_date\"]\n",
"\n",
"# Read only the three columns of interest from the tab-separated file.\n",
"# keep_default_na=False together with na_values=[\"\"] is deliberate: by default\n",
"# pandas parses the literal string \"NA\" as a missing value, and \"NA\" is the\n",
"# country code of Namibia, so every Namibian row would silently become NaN and\n",
"# be removed by a later dropna(). Only genuinely empty fields count as missing.\n",
"data = pd.read_csv(\n",
"    \"allCountries.txt\",\n",
"    sep='\\t',\n",
"    header=None,\n",
"    names=geonames_columns,\n",
"    usecols=[\"latitude\", \"longitude\", \"country\"],\n",
"    keep_default_na=False,\n",
"    na_values=[\"\"],\n",
")\n",
"\n",
"# select the columns of interest (usecols ignores element order, so fix it here)\n",
"data = data[[\"latitude\",\"longitude\",\"country\"]]\n",
"\n",
"# display the first 5 rows of the DataFrame\n",
"print(data.head())" ] }, { "cell_type": "code", "execution_count": 7, "id": "ee9987a5", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
latitudelongitudecountry
722680068.3239413.79753NO
207065731.2534292.09397CN
1185118333.98933-84.96254US
1112335534.16244-104.31050US
868803244.51929132.92021RU
\n", "
" ], "text/plain": [ " latitude longitude country\n", "7226800 68.32394 13.79753 NO\n", "2070657 31.25342 92.09397 CN\n", "11851183 33.98933 -84.96254 US\n", "11123355 34.16244 -104.31050 US\n", "8688032 44.51929 132.92021 RU" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.sample(5)" ] }, { "cell_type": "code", "execution_count": 2, "id": "a24a538d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(12363388, 3)" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data.shape" ] }, { "cell_type": "code", "execution_count": 3, "id": "13daf34e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "latitude 0\n", "longitude 0\n", "country 14551\n", "dtype: int64\n" ] } ], "source": [ "print(data.isnull().sum())" ] }, { "cell_type": "code", "execution_count": 4, "id": "afe982b3", "metadata": {}, "outputs": [], "source": [ "data = data.dropna()" ] }, { "cell_type": "markdown", "id": "a26b919e", "metadata": {}, "source": [ "## Train" ] }, { "cell_type": "code", "execution_count": 8, "id": "45b09303", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.9970337294839029\n" ] } ], "source": [
"from sklearn.model_selection import train_test_split\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"\n",
"# Features are the coordinates; the label is the country code\n",
"X = data[[\"latitude\", \"longitude\"]]\n",
"y = data[\"country\"]\n",
"\n",
"# Split the data into training and test sets (20% held out, fixed seed)\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
"\n",
"# Create a KNN classifier with 4 nearest neighbors\n",
"knn = KNeighborsClassifier(n_neighbors=4)\n",
"\n",
"# Fit the classifier to the training data\n",
"knn.fit(X_train, y_train)\n",
"\n",
"# Print the accuracy of the model on the test data\n",
"print(\"Accuracy:\", knn.score(X_test, y_test))\n" ] }, { "cell_type": "markdown", "id": "70a1290c", "metadata": {}, "source": [ 
"## Collect random data" ] }, { "cell_type": "code", "execution_count": 9, "id": "d45aee36", "metadata": {}, "outputs": [], "source": [
"import os\n",
"\n",
"# Read the OpenCage API key from the environment so a real key is never saved\n",
"# in the notebook; falls back to the placeholder when the variable is unset.\n",
"API_KEY = os.environ.get(\"OPENCAGE_API_KEY\", 'YOUR_API_KEY')" ] }, { "cell_type": "code", "execution_count": 12, "id": "fbd11bf3", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2500/2500 [24:46<00:00, 1.68it/s]\n" ] } ], "source": [
"import random\n",
"import requests\n",
"from tqdm import tqdm\n",
"\n",
"# Set the number of coordinates to generate\n",
"num_coordinates = 2500\n",
"\n",
"# Create an empty list to store the results\n",
"results = []\n",
"\n",
"# Use tqdm to display progress\n",
"for i in tqdm(range(num_coordinates)):\n",
"    # Generate a random latitude and longitude\n",
"    lat = random.uniform(-90, 90)\n",
"    lon = random.uniform(-180, 180)\n",
"\n",
"    # Send a request to the geolocation API.\n",
"    # The decoded body is kept in `payload` rather than `data`, so the DataFrame\n",
"    # named `data` used by the training cells is not overwritten by this loop.\n",
"    try:\n",
"        response = requests.get(\n",
"            f\"https://api.opencagedata.com/geocode/v1/json?q={lat}+{lon}&key={API_KEY}\",\n",
"            timeout=10,  # do not let one stalled request hang the whole run\n",
"        )\n",
"        payload = response.json()\n",
"        country = payload['results'][0]['components']['country']\n",
"    except (requests.RequestException, ValueError, LookupError, TypeError):\n",
"        # Network error, non-JSON body, or a response without a country entry:\n",
"        # keep the point but mark it with the 'NA' placeholder. Unlike a bare\n",
"        # `except:`, this still lets KeyboardInterrupt stop the loop.\n",
"        country = 'NA'\n",
"    results.append((lat, lon, country))\n",
"\n",
"# Create a DataFrame from the results\n",
"import pandas as pd\n",
"df_new = pd.DataFrame(results, columns=[\"latitude\", \"longitude\", \"country\"])" ] }, { "cell_type": "code", "execution_count": 20, "id": "a347489a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " latitude longitude country\n", "80 -67.057900 151.629167 NA\n", "694 61.930620 -66.216010 Canada\n", "1667 7.481893 -155.699348 NA\n", "1669 -70.585375 -24.491804 NA\n", "2203 70.407657 28.774544 Norway\n" ] } ], "source": [
"# index=False keeps the row index out of the file, so reading it back does not\n",
"# create a spurious \"Unnamed: 0\" column\n",
"df_new.to_csv('opencage_samples.csv', index=False)\n",
"print(df_new.sample(5))" ] }, { "cell_type": "code", 
"execution_count": 53, "id": "f784dc35", "metadata": {}, "outputs": [], "source": [
"df_new = pd.read_csv('opencage_samples.csv')\n",
"\n",
"# Load the countryInfo.txt file\n",
"with open(\"countryInfo.txt\", encoding=\"utf-8\") as f:\n",
"    country_data = f.readlines()\n",
"\n",
"# Create a dictionary to map country names to country codes.\n",
"# The split line is stored in `fields` rather than `data`, so this cell does not\n",
"# overwrite the `data` DataFrame that the training cells rely on.\n",
"country_codes = {}\n",
"for line in country_data:\n",
"    if line.startswith(\"#\"):\n",
"        continue  # comment / header line\n",
"    fields = line.strip().split(\"\\t\")\n",
"    # Skip blank or truncated lines that have no fifth (name) column\n",
"    if len(fields) > 4:\n",
"        country_codes[fields[4]] = fields[0]\n",
"\n",
"# Convert the country names to country codes in the new dataframe;\n",
"# names missing from the dictionary become NaN and are dropped below\n",
"df_new[\"country\"] = df_new[\"country\"].map(country_codes)\n",
"df_new = df_new.dropna()" ] }, { "cell_type": "code", "execution_count": 54, "id": "64e404dd", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0latitudelongitudecountry
65865845.190044-93.460327US
51551550.67577965.306393KZ
21602160-22.50180916.604622NA
1280128017.21968954.973136OM
21782178-1.269204106.984503ID
2298229829.857824-98.468546US
16616634.445968-83.261908US
220022000.482515-58.392550BR
2376237649.51129693.285772MN
1617161744.35710826.780326RO
\n", "
" ], "text/plain": [ " Unnamed: 0 latitude longitude country\n", "658 658 45.190044 -93.460327 US\n", "515 515 50.675779 65.306393 KZ\n", "2160 2160 -22.501809 16.604622 NA\n", "1280 1280 17.219689 54.973136 OM\n", "2178 2178 -1.269204 106.984503 ID\n", "2298 2298 29.857824 -98.468546 US\n", "166 166 34.445968 -83.261908 US\n", "2200 2200 0.482515 -58.392550 BR\n", "2376 2376 49.511296 93.285772 MN\n", "1617 1617 44.357108 26.780326 RO" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_new.sample(10)" ] }, { "cell_type": "markdown", "id": "f0f49be5", "metadata": {}, "source": [ "## Re-evaluate the model" ] }, { "cell_type": "code", "execution_count": 55, "id": "a00e2d78", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model accuracy: 0.986764705882353\n" ] } ], "source": [
"from sklearn.metrics import accuracy_score\n",
"\n",
"# Run the fitted classifier on the coordinates of the new dataframe\n",
"coords_new = df_new[[\"latitude\", \"longitude\"]]\n",
"predictions = knn.predict(coords_new)\n",
"\n",
"# Share of rows whose predicted code equals the label stored in df_new\n",
"labels_new = df_new[\"country\"]\n",
"accuracy = accuracy_score(labels_new, predictions)\n",
"print(f\"Model accuracy: {accuracy}\")" ] }, { "cell_type": "markdown", "id": "58decb87", "metadata": {}, "source": [ "## Exploring misses" ] }, { "cell_type": "code", "execution_count": 58, "id": "11d44c9f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Unnamed: 0 latitude longitude country predictions\n", "336 336 -18.241242 16.706105 NA AO\n", "931 931 -25.104256 22.713657 BW ZA\n", "958 958 -23.995538 17.280858 NA ZA\n", "1348 1348 -9.672760 -138.965349 FR PF\n", "1457 1457 -25.492231 16.244680 NA ZA\n", "1674 1674 25.880065 -14.315396 MA EH\n", "1991 1991 10.419924 23.470560 SD CF\n", "1999 1999 -49.213029 -73.137265 AR CL\n", "2160 2160 -22.501809 16.604622 NA BW\n" ] } ], "source": [
"# Store the predictions next to the labels, then show only the disagreeing rows\n",
"df_new[\"predictions\"] = predictions\n",
"is_miss = df_new[\"country\"] != df_new[\"predictions\"]\n",
"print(df_new.loc[is_miss])" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.6" } }, "nbformat": 4, "nbformat_minor": 5 }