{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "data_prep_4_statistical_classification.ipynb",
      "provenance": [],
      "mount_file_id": "1jm1z6gO0bcqUbp40qsEej2Cwnz1GGOl8",
      "authorship_tag": "ABX9TyNYhY2UvXRVJ5I1fcLq+r/7",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/monsund/unsupervised-classification-of-Linkedin-Profiles-using-KMeans-LDA-TFIDF/blob/master/data_prep_for_statistical_classification.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "QXXLz4evdT1B"
      },
      "source": [
        "import pandas as pd"
      ],
      "execution_count": 1,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "0BICdlXvultO"
      },
      "source": [
        "input_datafile = '/content/drive/My Drive/NLP/with_swaraj/Data/lda_mallet_prediction/18_topics_pred_bigram_threshold_70.csv'\n",
        "df = pd.read_csv(input_datafile)"
      ],
      "execution_count": 2,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "pM5-ufTmx-cK",
        "outputId": "f2634d27-bb9f-4e90-91bb-70fd38453eac",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        }
      },
      "source": [
        "df.drop_duplicates(subset='about', keep=\"last\", inplace=True) # Removing rows with duplicate About description\n",
        "df.reset_index(drop=True, inplace=True)  # resetting index\n",
        "print(len(df))"
      ],
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "8041\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "V-g4ffJ8uudx",
        "outputId": "6045d83b-3f43-4afb-cd24-6227dda61e96",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 204
        }
      },
      "source": [
        "df.head()"
      ],
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>url</th>\n",
              "      <th>username</th>\n",
              "      <th>title</th>\n",
              "      <th>about</th>\n",
              "      <th>dominant_topic</th>\n",
              "      <th>topic_prob</th>\n",
              "      <th>keywords</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>https://www.linkedin.com/in/lokesh-kumar-xess-...</td>\n",
              "      <td>lokesh-kumar-xess-54814068</td>\n",
              "      <td>Marketing &amp; Strategy at Mobistreak</td>\n",
              "      <td>I am a Google Adwords and Google Analytics cer...</td>\n",
              "      <td>5</td>\n",
              "      <td>0.184839</td>\n",
              "      <td>['marketing', 'digital', 'brand', 'medium', 'c...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>https://in.linkedin.com/in/sujithnarayanan</td>\n",
              "      <td>sujithnarayanan</td>\n",
              "      <td>Reimagining Financial Services for India</td>\n",
              "      <td>Ideate. Execute. Disrupt. Iterate.</td>\n",
              "      <td>12</td>\n",
              "      <td>0.066239</td>\n",
              "      <td>['business', 'sale', 'strategy', 'growth', 'ma...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>https://www.linkedin.com/in/adarshramakrishnan</td>\n",
              "      <td>adarshramakrishnan</td>\n",
              "      <td>Product Strategist | Product Manager | Triple ...</td>\n",
              "      <td>Over the past decade, I have co-founded 3 vent...</td>\n",
              "      <td>8</td>\n",
              "      <td>0.121083</td>\n",
              "      <td>['product', 'design', 'team', 'build', 'user',...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>https://www.linkedin.com/in/rhythm-bhatnagar-4...</td>\n",
              "      <td>rhythm-bhatnagar-4350b551</td>\n",
              "      <td>Product Marketer | Ex-SHEROES | Early-Stage St...</td>\n",
              "      <td>Hey,I am Rhythm. I am passionate about startup...</td>\n",
              "      <td>2</td>\n",
              "      <td>0.175537</td>\n",
              "      <td>['people', 'make', 'love', 'life', 'world', 't...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>https://www.linkedin.com/in/karthiksureshlbs</td>\n",
              "      <td>karthiksureshlbs</td>\n",
              "      <td>Product at Facebook | CMU MS | LBS MBA</td>\n",
              "      <td>I am a product-centric builder with experience...</td>\n",
              "      <td>8</td>\n",
              "      <td>0.174879</td>\n",
              "      <td>['product', 'design', 'team', 'build', 'user',...</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "                                                 url  ...                                           keywords\n",
              "0  https://www.linkedin.com/in/lokesh-kumar-xess-...  ...  ['marketing', 'digital', 'brand', 'medium', 'c...\n",
              "1         https://in.linkedin.com/in/sujithnarayanan  ...  ['business', 'sale', 'strategy', 'growth', 'ma...\n",
              "2     https://www.linkedin.com/in/adarshramakrishnan  ...  ['product', 'design', 'team', 'build', 'user',...\n",
              "3  https://www.linkedin.com/in/rhythm-bhatnagar-4...  ...  ['people', 'make', 'love', 'life', 'world', 't...\n",
              "4       https://www.linkedin.com/in/karthiksureshlbs  ...  ['product', 'design', 'team', 'build', 'user',...\n",
              "\n",
              "[5 rows x 7 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 4
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "gEn9bIjvy2yz"
      },
      "source": [
        "# **Collecting Marketing Profiles**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "NM101_kOvBc1",
        "outputId": "10a6042b-660b-4a5f-8641-70d6dd3c04db",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 51
        }
      },
      "source": [
        "df_marketing = df[df['dominant_topic']==5]\n",
        "print(len(df_marketing))\n",
        "df_marketing.topic_prob.mean()"
      ],
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "649\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "0.15014555646151115"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 5
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "OYqbwB6hohc-"
      },
      "source": [
        "**Keeping data with 'topic_prob' greater than 0.16**\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "8jXXZB0wwIx0",
        "outputId": "a7519072-7255-4c9c-ab35-70a1d051b7e6",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        }
      },
      "source": [
        "df_marketing_ = df_marketing[df_marketing['topic_prob']>0.16]\n",
        "print(len(df_marketing_))\n"
      ],
      "execution_count": 6,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "242\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "0gVVqOW-znnG"
      },
      "source": [
        "# **Collecting Product Leaders' Profiles**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "TXaFKVuNxFOe",
        "outputId": "b80ba075-5f8d-4ab6-8415-dfbfd8b045bc",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 51
        }
      },
      "source": [
        "df_pm = df[df['dominant_topic']==8]\n",
        "print(len(df_pm))\n",
        "df_pm.topic_prob.mean()"
      ],
      "execution_count": 7,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "499\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "0.13460675273224404"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 7
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "y-6FEwUspZxT"
      },
      "source": [
        "**Keeping data with 'topic_prob' greater than 0.15**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "EZDkspunz1Tu",
        "outputId": "0272ed75-9b44-4860-f345-6cd5a52b88de",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 159
        }
      },
      "source": [
        "df_pm_ = df_pm[df_pm['topic_prob']>0.15]\n",
        "print(len(df_pm_))\n",
        "df_pm_.head(3)"
      ],
      "execution_count": 8,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "156\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>url</th>\n",
              "      <th>username</th>\n",
              "      <th>title</th>\n",
              "      <th>about</th>\n",
              "      <th>dominant_topic</th>\n",
              "      <th>topic_prob</th>\n",
              "      <th>keywords</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>https://www.linkedin.com/in/karthiksureshlbs</td>\n",
              "      <td>karthiksureshlbs</td>\n",
              "      <td>Product at Facebook | CMU MS | LBS MBA</td>\n",
              "      <td>I am a product-centric builder with experience...</td>\n",
              "      <td>8</td>\n",
              "      <td>0.174879</td>\n",
              "      <td>['product', 'design', 'team', 'build', 'user',...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>24</th>\n",
              "      <td>https://www.linkedin.com/in/talvindersingh</td>\n",
              "      <td>talvindersingh</td>\n",
              "      <td>pragmaticleaders.io - Become a Product Manager...</td>\n",
              "      <td>A committed and highly focused entrepreneur an...</td>\n",
              "      <td>8</td>\n",
              "      <td>0.236236</td>\n",
              "      <td>['product', 'design', 'team', 'build', 'user',...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>26</th>\n",
              "      <td>https://www.linkedin.com/in/atingarg29</td>\n",
              "      <td>atingarg29</td>\n",
              "      <td>PM at Telio</td>\n",
              "      <td>Love solving product-journey problems in the c...</td>\n",
              "      <td>8</td>\n",
              "      <td>0.156636</td>\n",
              "      <td>['product', 'design', 'team', 'build', 'user',...</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "                                             url  ...                                           keywords\n",
              "4   https://www.linkedin.com/in/karthiksureshlbs  ...  ['product', 'design', 'team', 'build', 'user',...\n",
              "24    https://www.linkedin.com/in/talvindersingh  ...  ['product', 'design', 'team', 'build', 'user',...\n",
              "26        https://www.linkedin.com/in/atingarg29  ...  ['product', 'design', 'team', 'build', 'user',...\n",
              "\n",
              "[3 rows x 7 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 8
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "El4Hk-7W0zbY"
      },
      "source": [
        "# **Collecting Profiles with Specialization in Data**\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "9kwi0K1D0NB0",
        "outputId": "088656f0-5f7e-4f10-f850-8556b357d5e5",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 51
        }
      },
      "source": [
        "df_data = df[df['dominant_topic']==7]\n",
        "print(len(df_data))\n",
        "df_data.topic_prob.mean()"
      ],
      "execution_count": 9,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "438\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "0.14283705351053627"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 9
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "GIcxPAbH17Xz",
        "outputId": "9f105733-7abc-4838-f56f-86ad98aed64d",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 159
        }
      },
      "source": [
        "df_data_ = df_data[df_data['topic_prob']>0.146]\n",
        "print(len(df_data_))\n",
        "df_data_.head(3)"
      ],
      "execution_count": 10,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "163\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>url</th>\n",
              "      <th>username</th>\n",
              "      <th>title</th>\n",
              "      <th>about</th>\n",
              "      <th>dominant_topic</th>\n",
              "      <th>topic_prob</th>\n",
              "      <th>keywords</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>196</th>\n",
              "      <td>https://www.linkedin.com/in/agogino</td>\n",
              "      <td>agogino</td>\n",
              "      <td>CEO and Chief Technology Officer at Squishy Ro...</td>\n",
              "      <td>development engineering; Intelligent learning ...</td>\n",
              "      <td>7</td>\n",
              "      <td>0.148594</td>\n",
              "      <td>['datum', 'solution', 'analytic', 'data', 'ent...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>227</th>\n",
              "      <td>https://www.linkedin.com/in/shashvat</td>\n",
              "      <td>shashvat</td>\n",
              "      <td>Investment Banking, Data Science &amp; Technology</td>\n",
              "      <td>• Expertise in Model Risk &amp; Governance, Algo T...</td>\n",
              "      <td>7</td>\n",
              "      <td>0.161616</td>\n",
              "      <td>['datum', 'solution', 'analytic', 'data', 'ent...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>254</th>\n",
              "      <td>https://www.linkedin.com/in/pradeep-javangula-...</td>\n",
              "      <td>pradeep-javangula-94a54</td>\n",
              "      <td>VP of Machine Learning &amp; Artificial Intelligen...</td>\n",
              "      <td>Product and engineering executive, innovator &amp;...</td>\n",
              "      <td>7</td>\n",
              "      <td>0.227937</td>\n",
              "      <td>['datum', 'solution', 'analytic', 'data', 'ent...</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "                                                   url  ...                                           keywords\n",
              "196                https://www.linkedin.com/in/agogino  ...  ['datum', 'solution', 'analytic', 'data', 'ent...\n",
              "227               https://www.linkedin.com/in/shashvat  ...  ['datum', 'solution', 'analytic', 'data', 'ent...\n",
              "254  https://www.linkedin.com/in/pradeep-javangula-...  ...  ['datum', 'solution', 'analytic', 'data', 'ent...\n",
              "\n",
              "[3 rows x 7 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 10
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ho6FoktL4hKY"
      },
      "source": [
        "## **Profiles of Software Developers and Software-Related Roles**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "f9Ydj1uU1e9N",
        "outputId": "b918e719-d122-4000-8057-cfb8a98b0159",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 51
        }
      },
      "source": [
        "df_sw = df[df['dominant_topic']==10]\n",
        "print(len(df_sw))\n",
        "df_sw.topic_prob.mean()"
      ],
      "execution_count": 11,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "638\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "0.1565606411603038"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 11
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "rxB1J2xe5PD8",
        "outputId": "97ac9fe3-ce9b-49b5-fc86-3e0d02f9f001",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 128
        }
      },
      "source": [
        "df_sw_ = df_sw[df_sw['topic_prob']>.17]\n",
        "print(len(df_sw_))\n",
        "df_sw_.head(2)"
      ],
      "execution_count": 12,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "208\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>url</th>\n",
              "      <th>username</th>\n",
              "      <th>title</th>\n",
              "      <th>about</th>\n",
              "      <th>dominant_topic</th>\n",
              "      <th>topic_prob</th>\n",
              "      <th>keywords</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>40</th>\n",
              "      <td>https://www.linkedin.com/in/nakulsaxena</td>\n",
              "      <td>nakulsaxena</td>\n",
              "      <td>Senior SAP Consultant at Saxena Consulting</td>\n",
              "      <td>Nakul has a Masters in Business Administration...</td>\n",
              "      <td>10</td>\n",
              "      <td>0.309524</td>\n",
              "      <td>['software', 'application', 'web', 'system', '...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>64</th>\n",
              "      <td>https://www.linkedin.com/in/ashish-tripathi-aa...</td>\n",
              "      <td>ashish-tripathi-aa5788b</td>\n",
              "      <td>Solving Problems with Technology | Startup Ori...</td>\n",
              "      <td>• 13 years of experience of IT consulting arou...</td>\n",
              "      <td>10</td>\n",
              "      <td>0.236111</td>\n",
              "      <td>['software', 'application', 'web', 'system', '...</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "                                                  url  ...                                           keywords\n",
              "40            https://www.linkedin.com/in/nakulsaxena  ...  ['software', 'application', 'web', 'system', '...\n",
              "64  https://www.linkedin.com/in/ashish-tripathi-aa...  ...  ['software', 'application', 'web', 'system', '...\n",
              "\n",
              "[2 rows x 7 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 12
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "iulM_o3GYSxn"
      },
      "source": [
        "# **Startup-Related Profiles: Entrepreneurs and Investors with an Interest in Startups**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "GKYPwVMxXtAY",
        "outputId": "2b8a8355-0dee-4c41-d4c0-d2084a7aa05e",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 51
        }
      },
      "source": [
        "df_entrepreneur = df[df['dominant_topic']==14]\n",
        "print(len(df_entrepreneur))\n",
        "df_entrepreneur.topic_prob.mean()"
      ],
      "execution_count": 13,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "449\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "0.11801813307142091"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 13
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "XLaJRj39Yxl_",
        "outputId": "18a39144-7a1b-4653-f46d-7725feb3cb7d",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 221
        }
      },
      "source": [
        "df_entrepreneur_ = df_entrepreneur[df_entrepreneur['topic_prob']>0.137]\n",
        "print(len(df_entrepreneur_))\n",
        "df_entrepreneur_.head()"
      ],
      "execution_count": 14,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "117\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>url</th>\n",
              "      <th>username</th>\n",
              "      <th>title</th>\n",
              "      <th>about</th>\n",
              "      <th>dominant_topic</th>\n",
              "      <th>topic_prob</th>\n",
              "      <th>keywords</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>7</th>\n",
              "      <td>https://www.linkedin.com/in/nikhilguptadigital/</td>\n",
              "      <td>nikhilguptadigital</td>\n",
              "      <td>Creator Monetization  Zebra IQ</td>\n",
              "      <td>☀️ Dynamic product leader with a track record ...</td>\n",
              "      <td>14</td>\n",
              "      <td>0.164751</td>\n",
              "      <td>['startup', 'tech', 'build', 'start', 'entrepr...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>19</th>\n",
              "      <td>https://in.linkedin.com/in/nshntdxt</td>\n",
              "      <td>nshntdxt</td>\n",
              "      <td>Engineering Leader at Slack</td>\n",
              "      <td>Building high functioning agile teams, one ste...</td>\n",
              "      <td>14</td>\n",
              "      <td>0.152455</td>\n",
              "      <td>['startup', 'tech', 'build', 'start', 'entrepr...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>63</th>\n",
              "      <td>https://www.linkedin.com/in/pratikpoddar</td>\n",
              "      <td>pratikpoddar</td>\n",
              "      <td>Principal at Nexus Venture Partners</td>\n",
              "      <td>I am a venture capitalist looking to work with...</td>\n",
              "      <td>14</td>\n",
              "      <td>0.153305</td>\n",
              "      <td>['startup', 'tech', 'build', 'start', 'entrepr...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>100</th>\n",
              "      <td>https://www.linkedin.com/in/prasannainindia</td>\n",
              "      <td>prasannainindia</td>\n",
              "      <td>Helping founders build Value SaaS startups | O...</td>\n",
              "      <td>At Upekkha, we work with B2B SaaS startups to ...</td>\n",
              "      <td>14</td>\n",
              "      <td>0.137725</td>\n",
              "      <td>['startup', 'tech', 'build', 'start', 'entrepr...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>115</th>\n",
              "      <td>https://www.linkedin.com/in/rahulguptarmv</td>\n",
              "      <td>rahulguptarmv</td>\n",
              "      <td>Helping early stage startups with growth, fund...</td>\n",
              "      <td>Rahul has a decade of operational experience, ...</td>\n",
              "      <td>14</td>\n",
              "      <td>0.144676</td>\n",
              "      <td>['startup', 'tech', 'build', 'start', 'entrepr...</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "                                                 url  ...                                           keywords\n",
              "7    https://www.linkedin.com/in/nikhilguptadigital/  ...  ['startup', 'tech', 'build', 'start', 'entrepr...\n",
              "19               https://in.linkedin.com/in/nshntdxt  ...  ['startup', 'tech', 'build', 'start', 'entrepr...\n",
              "63          https://www.linkedin.com/in/pratikpoddar  ...  ['startup', 'tech', 'build', 'start', 'entrepr...\n",
              "100      https://www.linkedin.com/in/prasannainindia  ...  ['startup', 'tech', 'build', 'start', 'entrepr...\n",
              "115        https://www.linkedin.com/in/rahulguptarmv  ...  ['startup', 'tech', 'build', 'start', 'entrepr...\n",
              "\n",
              "[5 rows x 7 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 14
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "vXpLoQloaEVe"
      },
      "source": [
        "# **Profiles of People Working in Investment Companies or in Startups Looking for Investment**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "8ANlLV67Zk60",
        "outputId": "06d309e9-0096-4af0-c779-a78216a63b74",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 51
        }
      },
      "source": [
        "df_investor = df[df['dominant_topic']==11]\n",
        "print(len(df_investor))\n",
        "df_investor.topic_prob.mean()"
      ],
      "execution_count": 15,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "569\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "0.15687370162600825"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 15
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "EOQC5R_ea41P",
        "outputId": "efbecd8a-4ee4-4688-ce50-4de5610d3f56",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 159
        }
      },
      "source": [
        "df_investor_ = df_investor[df_investor['topic_prob']>0.19]\n",
        "print(len(df_investor_))\n",
        "df_investor_.head(3)"
      ],
      "execution_count": 16,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "136\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>url</th>\n",
              "      <th>username</th>\n",
              "      <th>title</th>\n",
              "      <th>about</th>\n",
              "      <th>dominant_topic</th>\n",
              "      <th>topic_prob</th>\n",
              "      <th>keywords</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>6</th>\n",
              "      <td>https://www.linkedin.com/in/karthikprabhakar</td>\n",
              "      <td>karthikprabhakar</td>\n",
              "      <td>Executive Director and Partner at Chiratae Ven...</td>\n",
              "      <td>Karthik Prabhakar is the Executive Director an...</td>\n",
              "      <td>11</td>\n",
              "      <td>0.208158</td>\n",
              "      <td>['company', 'investment', 'lead', 'venture', '...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>16</th>\n",
              "      <td>https://www.linkedin.com/in/vishyvenugopalan</td>\n",
              "      <td>vishyvenugopalan</td>\n",
              "      <td>Senior Vice President  Citi Ventures | AI, dat...</td>\n",
              "      <td>Vishy is on the venture investment team at Cit...</td>\n",
              "      <td>11</td>\n",
              "      <td>0.210210</td>\n",
              "      <td>['company', 'investment', 'lead', 'venture', '...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>33</th>\n",
              "      <td>https://www.linkedin.com/in/sanatrao</td>\n",
              "      <td>sanatrao</td>\n",
              "      <td>Managing Partner at Shyn Capital</td>\n",
              "      <td>Sanat Rao is a venture capital investor and M&amp;...</td>\n",
              "      <td>11</td>\n",
              "      <td>0.279412</td>\n",
              "      <td>['company', 'investment', 'lead', 'venture', '...</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "                                             url  ...                                           keywords\n",
              "6   https://www.linkedin.com/in/karthikprabhakar  ...  ['company', 'investment', 'lead', 'venture', '...\n",
              "16  https://www.linkedin.com/in/vishyvenugopalan  ...  ['company', 'investment', 'lead', 'venture', '...\n",
              "33          https://www.linkedin.com/in/sanatrao  ...  ['company', 'investment', 'lead', 'venture', '...\n",
              "\n",
              "[3 rows x 7 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 16
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "s_ntRM9scrsc"
      },
      "source": [
        "# **Business Developer, Sales, Strategic growth developer**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "37_dQ84jbdHQ",
        "outputId": "f69876fe-3d83-41be-f346-bf9fdb020f62",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 51
        }
      },
      "source": [
        "df_business = df[df['dominant_topic']==12]\n",
        "print(len(df_business))\n",
        "df_business.topic_prob.mean()"
      ],
      "execution_count": 17,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "405\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "0.12857702210846964"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 17
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Z5JTnKxEc7yU",
        "outputId": "a8dcdb25-cc8d-4cd8-970e-8dc02b31a95f",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 128
        }
      },
      "source": [
        "df_business_ = df_business[df_business['topic_prob']>0.14]\n",
        "print(len(df_business_))\n",
        "df_business_.head(2)"
      ],
      "execution_count": 18,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "127\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>url</th>\n",
              "      <th>username</th>\n",
              "      <th>title</th>\n",
              "      <th>about</th>\n",
              "      <th>dominant_topic</th>\n",
              "      <th>topic_prob</th>\n",
              "      <th>keywords</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>21</th>\n",
              "      <td>https://www.linkedin.com/in/rsudhirshenoy</td>\n",
              "      <td>rsudhirshenoy</td>\n",
              "      <td>Product Management, Marketing and Channel Mana...</td>\n",
              "      <td>Experienced Professional in the Technology Ind...</td>\n",
              "      <td>12</td>\n",
              "      <td>0.143474</td>\n",
              "      <td>['business', 'sale', 'strategy', 'growth', 'ma...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>29</th>\n",
              "      <td>https://in.linkedin.com/in/rishi-srivastava-12...</td>\n",
              "      <td>rishi-srivastava-120b005</td>\n",
              "      <td>Founder AARK TechAccess, General Partner BOV C...</td>\n",
              "      <td>Executive with 26+ years of international expe...</td>\n",
              "      <td>12</td>\n",
              "      <td>0.154725</td>\n",
              "      <td>['business', 'sale', 'strategy', 'growth', 'ma...</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "                                                  url  ...                                           keywords\n",
              "21          https://www.linkedin.com/in/rsudhirshenoy  ...  ['business', 'sale', 'strategy', 'growth', 'ma...\n",
              "29  https://in.linkedin.com/in/rishi-srivastava-12...  ...  ['business', 'sale', 'strategy', 'growth', 'ma...\n",
              "\n",
              "[2 rows x 7 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 18
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "FfRLtqhugMqA"
      },
      "source": [
        "# **Management skills Profile**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "xhrdMExGfpPH",
        "outputId": "ec589ab1-eb9c-4bf7-95bb-c0a14cb14b23",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 51
        }
      },
      "source": [
        "df_manager = df[df['dominant_topic']==3]\n",
        "print(len(df_manager))\n",
        "df_manager.topic_prob.mean()"
      ],
      "execution_count": 19,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "326\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "0.13591641734432516"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 19
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "spsGGNqRgie8",
        "outputId": "8c41777a-84be-4258-e939-2b0d6dc5481d",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 128
        }
      },
      "source": [
        "df_manager_ = df_manager[df_manager['topic_prob']>0.14]\n",
        "print(len(df_manager_))\n",
        "df_manager_.head(2)"
      ],
      "execution_count": 20,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "131\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>url</th>\n",
              "      <th>username</th>\n",
              "      <th>title</th>\n",
              "      <th>about</th>\n",
              "      <th>dominant_topic</th>\n",
              "      <th>topic_prob</th>\n",
              "      <th>keywords</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>96</th>\n",
              "      <td>https://www.linkedin.com/in/kievewang</td>\n",
              "      <td>kievewang</td>\n",
              "      <td>Sr. Executive Assistant</td>\n",
              "      <td>Skills: Customer Service, Technical/Customer S...</td>\n",
              "      <td>3</td>\n",
              "      <td>0.181637</td>\n",
              "      <td>['management', 'customer', 'manage', 'process'...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>228</th>\n",
              "      <td>https://www.linkedin.com/in/imnikhilbhaskaran/</td>\n",
              "      <td>imnikhilbhaskaran</td>\n",
              "      <td>Associate Director, Customer Success at Icertis</td>\n",
              "      <td>I am passionate about delivering world class T...</td>\n",
              "      <td>3</td>\n",
              "      <td>0.156695</td>\n",
              "      <td>['management', 'customer', 'manage', 'process'...</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "                                                url  ...                                           keywords\n",
              "96            https://www.linkedin.com/in/kievewang  ...  ['management', 'customer', 'manage', 'process'...\n",
              "228  https://www.linkedin.com/in/imnikhilbhaskaran/  ...  ['management', 'customer', 'manage', 'process'...\n",
              "\n",
              "[2 rows x 7 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 20
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "EiJec0rlIoa7"
      },
      "source": [
        "# **HR Profiles**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "3vVZRFWag9HS",
        "outputId": "074ecbc4-3428-441c-a086-e69c74ba3719",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 162
        }
      },
      "source": [
        "df_hr = df[df['dominant_topic']==4]\n",
        "print(len(df_hr))\n",
        "df_hr.head(2)"
      ],
      "execution_count": 21,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "309\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>url</th>\n",
              "      <th>username</th>\n",
              "      <th>title</th>\n",
              "      <th>about</th>\n",
              "      <th>dominant_topic</th>\n",
              "      <th>topic_prob</th>\n",
              "      <th>keywords</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>178</th>\n",
              "      <td>https://www.linkedin.com/in/jonathan-parisot-1...</td>\n",
              "      <td>jonathan-parisot-10054625</td>\n",
              "      <td>Co-Founder &amp; CEO at Actiondesk (YC S19)</td>\n",
              "      <td>Building Actiondesk, a spreadsheet software na...</td>\n",
              "      <td>4</td>\n",
              "      <td>0.087037</td>\n",
              "      <td>['team', 'build', 'lead', 'leadership', 'organ...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>298</th>\n",
              "      <td>https://www.linkedin.com/in/meghan-herring-049...</td>\n",
              "      <td>meghan-herring-0491635</td>\n",
              "      <td>Senior Technical Recruiter</td>\n",
              "      <td>SENIOR TECHNICAL RECRUITER / HUMAN RESOURCES L...</td>\n",
              "      <td>4</td>\n",
              "      <td>0.140461</td>\n",
              "      <td>['team', 'build', 'lead', 'leadership', 'organ...</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "                                                   url  ...                                           keywords\n",
              "178  https://www.linkedin.com/in/jonathan-parisot-1...  ...  ['team', 'build', 'lead', 'leadership', 'organ...\n",
              "298  https://www.linkedin.com/in/meghan-herring-049...  ...  ['team', 'build', 'lead', 'leadership', 'organ...\n",
              "\n",
              "[2 rows x 7 columns]"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 21
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ppfPSV0ZU5gI"
      },
      "source": [
        "**Marking rows whose about section contains HR keywords**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "TKB9De2SJU9_",
        "outputId": "4c824920-24b8-46f6-b71d-1d781f8856a9",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 119
        }
      },
      "source": [
        "hr_word = set(['hr ', ' hr' 'human resource', 'recruit'])\n",
        "df_hr['checking_hr_word'] = df_hr.about.apply(lambda x:  ['true' for word in hr_word if word in x.lower() ])\n",
        "# df_hr.head(2)"
      ],
      "execution_count": 22,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
            "A value is trying to be set on a copy of a slice from a DataFrame.\n",
            "Try using .loc[row_indexer,col_indexer] = value instead\n",
            "\n",
            "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
            "  \n"
          ],
          "name": "stderr"
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "OoJYNcQ_VQqT"
      },
      "source": [
        "**Collecting the DataFrame row indices where the about section contains HR-related words**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "xq_yj5BQKHc3"
      },
      "source": [
        "row_index_with_hr_words = []\n",
        "for row in df_hr.index:\n",
        "  if len(df_hr['checking_hr_word'][row])!=0:\n",
        "    row_index_with_hr_words.append(row)\n",
        "    # print(row,'\\n')"
      ],
      "execution_count": 23,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "gxcBB06-VrIm"
      },
      "source": [
        "**Collecting the final rows with an HR-related about section**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Gbumj-5cM6WW"
      },
      "source": [
        "final_df_hr_list = []\n",
        "for index in row_index_with_hr_words:\n",
        "  url = df_hr.url[index]\n",
        "  username = df_hr.username[index]\n",
        "  title = df_hr.title[index]\n",
        "  about = df_hr.about[index]\n",
        "  dominant_topic = df_hr.dominant_topic[index]\n",
        "  topic_prob = df_hr.topic_prob[index]\n",
        "  keywords = df_hr.keywords[index]\n",
        "  dic = {'url':url,'username':username,'title':title,'about':about,'dominant_topic':dominant_topic,'topic_prob':topic_prob,'keywords':keywords}\n",
        "  final_df_hr_list.append(dic)\n",
        "  \n",
        "df_hr_ = pd.DataFrame(final_df_hr_list)"
      ],
      "execution_count": 24,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "uAEIOZ8vSyf-",
        "outputId": "a2968841-fab5-410d-f1d5-83dbbc0f4699",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        }
      },
      "source": [
        "df_hr_ = df_hr_[df_hr_['topic_prob']>0.15]\n",
        "len(df_hr_)\n"
      ],
      "execution_count": 25,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "85"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 25
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "P7XRsRqbXSPJ"
      },
      "source": [
        "# **Final Data Preparation for Statistical Model Classification**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "wgxy8cf-XcKz"
      },
      "source": [
        "df_final_data = pd.concat([df_marketing_,df_pm_,df_data_,df_sw_, df_entrepreneur_, df_investor_,df_manager_,df_business_,df_hr_],axis=0)"
      ],
      "execution_count": 26,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "DpeZ3nGqYXRF",
        "outputId": "ab0e544d-d912-4c3e-d45a-af4538a1828c",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        }
      },
      "source": [
        "len(df_final_data)"
      ],
      "execution_count": 27,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "1365"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 27
        }
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "sm9j-OXtcPWx"
      },
      "source": [
        "# **Export final data frame**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "YII0kg0Hcs3W"
      },
      "source": [
        "df_final_data.to_csv('/content/drive/My Drive/NLP/with_swaraj/Data/data_for_statistical_classification_1.csv')"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "yZywI1nndJes"
      },
      "source": [
        ""
      ],
      "execution_count": null,
      "outputs": []
    }
  ]
}