{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "data_prep_4_statistical_classification.ipynb", "provenance": [], "mount_file_id": "1jm1z6gO0bcqUbp40qsEej2Cwnz1GGOl8", "authorship_tag": "ABX9TyNYhY2UvXRVJ5I1fcLq+r/7", "include_colab_link": true }, "kernelspec": { "name": "python3", "display_name": "Python 3" } }, "cells": [ { "cell_type": "markdown", "metadata": { "id": "view-in-github", "colab_type": "text" }, "source": [ "<a href=\"https://colab.research.google.com/github/monsund/unsupervised-classification-of-Linkedin-Profiles-using-KMeans-LDA-TFIDF/blob/master/data_prep_for_statistical_classification.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" ] }, { "cell_type": "code", "metadata": { "id": "QXXLz4evdT1B" }, "source": [ "import pandas as pd" ], "execution_count": 1, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "0BICdlXvultO" }, "source": [ "input_datafile = '/content/drive/My Drive/NLP/with_swaraj/Data/lda_mallet_prediction/18_topics_pred_bigram_threshold_70.csv'\n", "df = pd.read_csv(input_datafile)" ], "execution_count": 2, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "pM5-ufTmx-cK", "outputId": "f2634d27-bb9f-4e90-91bb-70fd38453eac", "colab": { "base_uri": "https://localhost:8080/", "height": 34 } }, "source": [ "df.drop_duplicates(subset='about', keep=\"last\", inplace=True) # Removing rows with duplicate About description\n", "df.reset_index(drop=True, inplace=True) # resetting index\n", "print(len(df))" ], "execution_count": 3, "outputs": [ { "output_type": "stream", "text": [ "8041\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "V-g4ffJ8uudx", "outputId": "6045d83b-3f43-4afb-cd24-6227dda61e96", "colab": { "base_uri": "https://localhost:8080/", "height": 204 } }, "source": [ "df.head()" ], "execution_count": 4, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "<div>\n", 
"<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>url</th>\n", " <th>username</th>\n", " <th>title</th>\n", " <th>about</th>\n", " <th>dominant_topic</th>\n", " <th>topic_prob</th>\n", " <th>keywords</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>0</th>\n", " <td>https://www.linkedin.com/in/lokesh-kumar-xess-...</td>\n", " <td>lokesh-kumar-xess-54814068</td>\n", " <td>Marketing & Strategy at Mobistreak</td>\n", " <td>I am a Google Adwords and Google Analytics cer...</td>\n", " <td>5</td>\n", " <td>0.184839</td>\n", " <td>['marketing', 'digital', 'brand', 'medium', 'c...</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>https://in.linkedin.com/in/sujithnarayanan</td>\n", " <td>sujithnarayanan</td>\n", " <td>Reimagining Financial Services for India</td>\n", " <td>Ideate. Execute. Disrupt. Iterate.</td>\n", " <td>12</td>\n", " <td>0.066239</td>\n", " <td>['business', 'sale', 'strategy', 'growth', 'ma...</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", " <td>https://www.linkedin.com/in/adarshramakrishnan</td>\n", " <td>adarshramakrishnan</td>\n", " <td>Product Strategist | Product Manager | Triple ...</td>\n", " <td>Over the past decade, I have co-founded 3 vent...</td>\n", " <td>8</td>\n", " <td>0.121083</td>\n", " <td>['product', 'design', 'team', 'build', 'user',...</td>\n", " </tr>\n", " <tr>\n", " <th>3</th>\n", " <td>https://www.linkedin.com/in/rhythm-bhatnagar-4...</td>\n", " <td>rhythm-bhatnagar-4350b551</td>\n", " <td>Product Marketer | Ex-SHEROES | Early-Stage St...</td>\n", " <td>Hey,I am Rhythm. 
I am passionate about startup...</td>\n", " <td>2</td>\n", " <td>0.175537</td>\n", " <td>['people', 'make', 'love', 'life', 'world', 't...</td>\n", " </tr>\n", " <tr>\n", " <th>4</th>\n", " <td>https://www.linkedin.com/in/karthiksureshlbs</td>\n", " <td>karthiksureshlbs</td>\n", " <td>Product at Facebook | CMU MS | LBS MBA</td>\n", " <td>I am a product-centric builder with experience...</td>\n", " <td>8</td>\n", " <td>0.174879</td>\n", " <td>['product', 'design', 'team', 'build', 'user',...</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " url ... keywords\n", "0 https://www.linkedin.com/in/lokesh-kumar-xess-... ... ['marketing', 'digital', 'brand', 'medium', 'c...\n", "1 https://in.linkedin.com/in/sujithnarayanan ... ['business', 'sale', 'strategy', 'growth', 'ma...\n", "2 https://www.linkedin.com/in/adarshramakrishnan ... ['product', 'design', 'team', 'build', 'user',...\n", "3 https://www.linkedin.com/in/rhythm-bhatnagar-4... ... ['people', 'make', 'love', 'life', 'world', 't...\n", "4 https://www.linkedin.com/in/karthiksureshlbs ... 
['product', 'design', 'team', 'build', 'user',...\n", "\n", "[5 rows x 7 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 4 } ] }, { "cell_type": "markdown", "metadata": { "id": "gEn9bIjvy2yz" }, "source": [ "# **Collecting Marketing Profiles**" ] }, { "cell_type": "code", "metadata": { "id": "NM101_kOvBc1", "outputId": "10a6042b-660b-4a5f-8641-70d6dd3c04db", "colab": { "base_uri": "https://localhost:8080/", "height": 51 } }, "source": [ "df_marketing = df[df['dominant_topic']==5]\n", "print(len(df_marketing))\n", "df_marketing.topic_prob.mean()" ], "execution_count": 5, "outputs": [ { "output_type": "stream", "text": [ "649\n" ], "name": "stdout" }, { "output_type": "execute_result", "data": { "text/plain": [ "0.15014555646151115" ] }, "metadata": { "tags": [] }, "execution_count": 5 } ] }, { "cell_type": "markdown", "metadata": { "id": "OYqbwB6hohc-" }, "source": [ "**Keeping data with 'topic_prob' greater than 0.16**\n" ] }, { "cell_type": "code", "metadata": { "id": "8jXXZB0wwIx0", "outputId": "a7519072-7255-4c9c-ab35-70a1d051b7e6", "colab": { "base_uri": "https://localhost:8080/", "height": 34 } }, "source": [ "df_marketing_ = df_marketing[df_marketing['topic_prob']>0.16]\n", "print(len(df_marketing_))\n" ], "execution_count": 6, "outputs": [ { "output_type": "stream", "text": [ "242\n" ], "name": "stdout" } ] }, { "cell_type": "markdown", "metadata": { "id": "0gVVqOW-znnG" }, "source": [ "# **Collecting Product Leader's Profile**" ] }, { "cell_type": "code", "metadata": { "id": "TXaFKVuNxFOe", "outputId": "b80ba075-5f8d-4ab6-8415-dfbfd8b045bc", "colab": { "base_uri": "https://localhost:8080/", "height": 51 } }, "source": [ "df_pm = df[df['dominant_topic']==8]\n", "print(len(df_pm))\n", "df_pm.topic_prob.mean()" ], "execution_count": 7, "outputs": [ { "output_type": "stream", "text": [ "499\n" ], "name": "stdout" }, { "output_type": "execute_result", "data": { "text/plain": [ "0.13460675273224404" ] }, "metadata": { "tags": [] }, 
"execution_count": 7 } ] }, { "cell_type": "markdown", "metadata": { "id": "y-6FEwUspZxT" }, "source": [ "**Keeping data with 'topic_prob' greater than 0.15**" ] }, { "cell_type": "code", "metadata": { "id": "EZDkspunz1Tu", "outputId": "0272ed75-9b44-4860-f345-6cd5a52b88de", "colab": { "base_uri": "https://localhost:8080/", "height": 159 } }, "source": [ "df_pm_ = df_pm[df_pm['topic_prob']>0.15]\n", "print(len(df_pm_))\n", "df_pm_.head(3)" ], "execution_count": 8, "outputs": [ { "output_type": "stream", "text": [ "156\n" ], "name": "stdout" }, { "output_type": "execute_result", "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>url</th>\n", " <th>username</th>\n", " <th>title</th>\n", " <th>about</th>\n", " <th>dominant_topic</th>\n", " <th>topic_prob</th>\n", " <th>keywords</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>4</th>\n", " <td>https://www.linkedin.com/in/karthiksureshlbs</td>\n", " <td>karthiksureshlbs</td>\n", " <td>Product at Facebook | CMU MS | LBS MBA</td>\n", " <td>I am a product-centric builder with experience...</td>\n", " <td>8</td>\n", " <td>0.174879</td>\n", " <td>['product', 'design', 'team', 'build', 'user',...</td>\n", " </tr>\n", " <tr>\n", " <th>24</th>\n", " <td>https://www.linkedin.com/in/talvindersingh</td>\n", " <td>talvindersingh</td>\n", " <td>pragmaticleaders.io - Become a Product Manager...</td>\n", " <td>A committed and highly focused entrepreneur an...</td>\n", " <td>8</td>\n", " <td>0.236236</td>\n", " <td>['product', 'design', 'team', 'build', 'user',...</td>\n", " </tr>\n", " <tr>\n", " <th>26</th>\n", " 
<td>https://www.linkedin.com/in/atingarg29</td>\n", " <td>atingarg29</td>\n", " <td>PM at Telio</td>\n", " <td>Love solving product-journey problems in the c...</td>\n", " <td>8</td>\n", " <td>0.156636</td>\n", " <td>['product', 'design', 'team', 'build', 'user',...</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " url ... keywords\n", "4 https://www.linkedin.com/in/karthiksureshlbs ... ['product', 'design', 'team', 'build', 'user',...\n", "24 https://www.linkedin.com/in/talvindersingh ... ['product', 'design', 'team', 'build', 'user',...\n", "26 https://www.linkedin.com/in/atingarg29 ... ['product', 'design', 'team', 'build', 'user',...\n", "\n", "[3 rows x 7 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 8 } ] }, { "cell_type": "markdown", "metadata": { "id": "El4Hk-7W0zbY" }, "source": [ "# **Collecting Profiles with Specialization in Data**\n" ] }, { "cell_type": "code", "metadata": { "id": "9kwi0K1D0NB0", "outputId": "088656f0-5f7e-4f10-f850-8556b357d5e5", "colab": { "base_uri": "https://localhost:8080/", "height": 51 } }, "source": [ "df_data = df[df['dominant_topic']==7]\n", "print(len(df_data))\n", "df_data.topic_prob.mean()" ], "execution_count": 9, "outputs": [ { "output_type": "stream", "text": [ "438\n" ], "name": "stdout" }, { "output_type": "execute_result", "data": { "text/plain": [ "0.14283705351053627" ] }, "metadata": { "tags": [] }, "execution_count": 9 } ] }, { "cell_type": "code", "metadata": { "id": "GIcxPAbH17Xz", "outputId": "9f105733-7abc-4838-f56f-86ad98aed64d", "colab": { "base_uri": "https://localhost:8080/", "height": 159 } }, "source": [ "df_data_ = df_data[df_data['topic_prob']>0.146]\n", "print(len(df_data_))\n", "df_data_.head(3)" ], "execution_count": 10, "outputs": [ { "output_type": "stream", "text": [ "163\n" ], "name": "stdout" }, { "output_type": "execute_result", "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: 
middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>url</th>\n", " <th>username</th>\n", " <th>title</th>\n", " <th>about</th>\n", " <th>dominant_topic</th>\n", " <th>topic_prob</th>\n", " <th>keywords</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>196</th>\n", " <td>https://www.linkedin.com/in/agogino</td>\n", " <td>agogino</td>\n", " <td>CEO and Chief Technology Officer at Squishy Ro...</td>\n", " <td>development engineering; Intelligent learning ...</td>\n", " <td>7</td>\n", " <td>0.148594</td>\n", " <td>['datum', 'solution', 'analytic', 'data', 'ent...</td>\n", " </tr>\n", " <tr>\n", " <th>227</th>\n", " <td>https://www.linkedin.com/in/shashvat</td>\n", " <td>shashvat</td>\n", " <td>Investment Banking, Data Science & Technology</td>\n", " <td>• Expertise in Model Risk & Governance, Algo T...</td>\n", " <td>7</td>\n", " <td>0.161616</td>\n", " <td>['datum', 'solution', 'analytic', 'data', 'ent...</td>\n", " </tr>\n", " <tr>\n", " <th>254</th>\n", " <td>https://www.linkedin.com/in/pradeep-javangula-...</td>\n", " <td>pradeep-javangula-94a54</td>\n", " <td>VP of Machine Learning & Artificial Intelligen...</td>\n", " <td>Product and engineering executive, innovator &...</td>\n", " <td>7</td>\n", " <td>0.227937</td>\n", " <td>['datum', 'solution', 'analytic', 'data', 'ent...</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " url ... keywords\n", "196 https://www.linkedin.com/in/agogino ... ['datum', 'solution', 'analytic', 'data', 'ent...\n", "227 https://www.linkedin.com/in/shashvat ... ['datum', 'solution', 'analytic', 'data', 'ent...\n", "254 https://www.linkedin.com/in/pradeep-javangula-... ... 
['datum', 'solution', 'analytic', 'data', 'ent...\n", "\n", "[3 rows x 7 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 10 } ] }, { "cell_type": "markdown", "metadata": { "id": "ho6FoktL4hKY" }, "source": [ "## **Profiles of Software Developer and Software related**" ] }, { "cell_type": "code", "metadata": { "id": "f9Ydj1uU1e9N", "outputId": "b918e719-d122-4000-8057-cfb8a98b0159", "colab": { "base_uri": "https://localhost:8080/", "height": 51 } }, "source": [ "df_sw = df[df['dominant_topic']==10]\n", "print(len(df_sw))\n", "df_sw.topic_prob.mean()" ], "execution_count": 11, "outputs": [ { "output_type": "stream", "text": [ "638\n" ], "name": "stdout" }, { "output_type": "execute_result", "data": { "text/plain": [ "0.1565606411603038" ] }, "metadata": { "tags": [] }, "execution_count": 11 } ] }, { "cell_type": "code", "metadata": { "id": "rxB1J2xe5PD8", "outputId": "97ac9fe3-ce9b-49b5-fc86-3e0d02f9f001", "colab": { "base_uri": "https://localhost:8080/", "height": 128 } }, "source": [ "df_sw_ = df_sw[df_sw['topic_prob']>.17]\n", "print(len(df_sw_))\n", "df_sw_.head(2)" ], "execution_count": 12, "outputs": [ { "output_type": "stream", "text": [ "208\n" ], "name": "stdout" }, { "output_type": "execute_result", "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>url</th>\n", " <th>username</th>\n", " <th>title</th>\n", " <th>about</th>\n", " <th>dominant_topic</th>\n", " <th>topic_prob</th>\n", " <th>keywords</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>40</th>\n", " <td>https://www.linkedin.com/in/nakulsaxena</td>\n", " <td>nakulsaxena</td>\n", " <td>Senior SAP Consultant at 
Saxena Consulting</td>\n", " <td>Nakul has a Masters in Business Administration...</td>\n", " <td>10</td>\n", " <td>0.309524</td>\n", " <td>['software', 'application', 'web', 'system', '...</td>\n", " </tr>\n", " <tr>\n", " <th>64</th>\n", " <td>https://www.linkedin.com/in/ashish-tripathi-aa...</td>\n", " <td>ashish-tripathi-aa5788b</td>\n", " <td>Solving Problems with Technology | Startup Ori...</td>\n", " <td>• 13 years of experience of IT consulting arou...</td>\n", " <td>10</td>\n", " <td>0.236111</td>\n", " <td>['software', 'application', 'web', 'system', '...</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " url ... keywords\n", "40 https://www.linkedin.com/in/nakulsaxena ... ['software', 'application', 'web', 'system', '...\n", "64 https://www.linkedin.com/in/ashish-tripathi-aa... ... ['software', 'application', 'web', 'system', '...\n", "\n", "[2 rows x 7 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 12 } ] }, { "cell_type": "markdown", "metadata": { "id": "iulM_o3GYSxn" }, "source": [ "# **Profiles with StartUp, entrepreneur, investor with interest in startups**" ] }, { "cell_type": "code", "metadata": { "id": "GKYPwVMxXtAY", "outputId": "2b8a8355-0dee-4c41-d4c0-d2084a7aa05e", "colab": { "base_uri": "https://localhost:8080/", "height": 51 } }, "source": [ "df_entrepreneur = df[df['dominant_topic']==14]\n", "print(len(df_entrepreneur))\n", "df_entrepreneur.topic_prob.mean()" ], "execution_count": 13, "outputs": [ { "output_type": "stream", "text": [ "449\n" ], "name": "stdout" }, { "output_type": "execute_result", "data": { "text/plain": [ "0.11801813307142091" ] }, "metadata": { "tags": [] }, "execution_count": 13 } ] }, { "cell_type": "code", "metadata": { "id": "XLaJRj39Yxl_", "outputId": "18a39144-7a1b-4653-f46d-7725feb3cb7d", "colab": { "base_uri": "https://localhost:8080/", "height": 221 } }, "source": [ "df_entrepreneur_ = df_entrepreneur[df_entrepreneur['topic_prob']>0.137]\n", 
"print(len(df_entrepreneur_))\n", "df_entrepreneur_.head()" ], "execution_count": 14, "outputs": [ { "output_type": "stream", "text": [ "117\n" ], "name": "stdout" }, { "output_type": "execute_result", "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>url</th>\n", " <th>username</th>\n", " <th>title</th>\n", " <th>about</th>\n", " <th>dominant_topic</th>\n", " <th>topic_prob</th>\n", " <th>keywords</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>7</th>\n", " <td>https://www.linkedin.com/in/nikhilguptadigital/</td>\n", " <td>nikhilguptadigital</td>\n", " <td>Creator Monetization Zebra IQ</td>\n", " <td>☀️ Dynamic product leader with a track record ...</td>\n", " <td>14</td>\n", " <td>0.164751</td>\n", " <td>['startup', 'tech', 'build', 'start', 'entrepr...</td>\n", " </tr>\n", " <tr>\n", " <th>19</th>\n", " <td>https://in.linkedin.com/in/nshntdxt</td>\n", " <td>nshntdxt</td>\n", " <td>Engineering Leader at Slack</td>\n", " <td>Building high functioning agile teams, one ste...</td>\n", " <td>14</td>\n", " <td>0.152455</td>\n", " <td>['startup', 'tech', 'build', 'start', 'entrepr...</td>\n", " </tr>\n", " <tr>\n", " <th>63</th>\n", " <td>https://www.linkedin.com/in/pratikpoddar</td>\n", " <td>pratikpoddar</td>\n", " <td>Principal at Nexus Venture Partners</td>\n", " <td>I am a venture capitalist looking to work with...</td>\n", " <td>14</td>\n", " <td>0.153305</td>\n", " <td>['startup', 'tech', 'build', 'start', 'entrepr...</td>\n", " </tr>\n", " <tr>\n", " <th>100</th>\n", " <td>https://www.linkedin.com/in/prasannainindia</td>\n", " <td>prasannainindia</td>\n", " <td>Helping founders build 
Value SaaS startups | O...</td>\n", " <td>At Upekkha, we work with B2B SaaS startups to ...</td>\n", " <td>14</td>\n", " <td>0.137725</td>\n", " <td>['startup', 'tech', 'build', 'start', 'entrepr...</td>\n", " </tr>\n", " <tr>\n", " <th>115</th>\n", " <td>https://www.linkedin.com/in/rahulguptarmv</td>\n", " <td>rahulguptarmv</td>\n", " <td>Helping early stage startups with growth, fund...</td>\n", " <td>Rahul has a decade of operational experience, ...</td>\n", " <td>14</td>\n", " <td>0.144676</td>\n", " <td>['startup', 'tech', 'build', 'start', 'entrepr...</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " url ... keywords\n", "7 https://www.linkedin.com/in/nikhilguptadigital/ ... ['startup', 'tech', 'build', 'start', 'entrepr...\n", "19 https://in.linkedin.com/in/nshntdxt ... ['startup', 'tech', 'build', 'start', 'entrepr...\n", "63 https://www.linkedin.com/in/pratikpoddar ... ['startup', 'tech', 'build', 'start', 'entrepr...\n", "100 https://www.linkedin.com/in/prasannainindia ... ['startup', 'tech', 'build', 'start', 'entrepr...\n", "115 https://www.linkedin.com/in/rahulguptarmv ... 
['startup', 'tech', 'build', 'start', 'entrepr...\n", "\n", "[5 rows x 7 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 14 } ] }, { "cell_type": "markdown", "metadata": { "id": "vXpLoQloaEVe" }, "source": [ "# **Profiles Working in Investment Companies or Startups Looking for Investment**" ] }, { "cell_type": "code", "metadata": { "id": "8ANlLV67Zk60", "outputId": "06d309e9-0096-4af0-c779-a78216a63b74", "colab": { "base_uri": "https://localhost:8080/", "height": 51 } }, "source": [ "df_investor = df[df['dominant_topic']==11]\n", "print(len(df_investor))\n", "df_investor.topic_prob.mean()" ], "execution_count": 15, "outputs": [ { "output_type": "stream", "text": [ "569\n" ], "name": "stdout" }, { "output_type": "execute_result", "data": { "text/plain": [ "0.15687370162600825" ] }, "metadata": { "tags": [] }, "execution_count": 15 } ] }, { "cell_type": "code", "metadata": { "id": "EOQC5R_ea41P", "outputId": "efbecd8a-4ee4-4688-ce50-4de5610d3f56", "colab": { "base_uri": "https://localhost:8080/", "height": 159 } }, "source": [ "df_investor_ = df_investor[df_investor['topic_prob']>0.19]\n", "print(len(df_investor_))\n", "df_investor_.head(3)" ], "execution_count": 16, "outputs": [ { "output_type": "stream", "text": [ "136\n" ], "name": "stdout" }, { "output_type": "execute_result", "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>url</th>\n", " <th>username</th>\n", " <th>title</th>\n", " <th>about</th>\n", " <th>dominant_topic</th>\n", " <th>topic_prob</th>\n", " <th>keywords</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>6</th>\n", " 
<td>https://www.linkedin.com/in/karthikprabhakar</td>\n", " <td>karthikprabhakar</td>\n", " <td>Executive Director and Partner at Chiratae Ven...</td>\n", " <td>Karthik Prabhakar is the Executive Director an...</td>\n", " <td>11</td>\n", " <td>0.208158</td>\n", " <td>['company', 'investment', 'lead', 'venture', '...</td>\n", " </tr>\n", " <tr>\n", " <th>16</th>\n", " <td>https://www.linkedin.com/in/vishyvenugopalan</td>\n", " <td>vishyvenugopalan</td>\n", " <td>Senior Vice President Citi Ventures | AI, dat...</td>\n", " <td>Vishy is on the venture investment team at Cit...</td>\n", " <td>11</td>\n", " <td>0.210210</td>\n", " <td>['company', 'investment', 'lead', 'venture', '...</td>\n", " </tr>\n", " <tr>\n", " <th>33</th>\n", " <td>https://www.linkedin.com/in/sanatrao</td>\n", " <td>sanatrao</td>\n", " <td>Managing Partner at Shyn Capital</td>\n", " <td>Sanat Rao is a venture capital investor and M&...</td>\n", " <td>11</td>\n", " <td>0.279412</td>\n", " <td>['company', 'investment', 'lead', 'venture', '...</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " url ... keywords\n", "6 https://www.linkedin.com/in/karthikprabhakar ... ['company', 'investment', 'lead', 'venture', '...\n", "16 https://www.linkedin.com/in/vishyvenugopalan ... ['company', 'investment', 'lead', 'venture', '...\n", "33 https://www.linkedin.com/in/sanatrao ... 
['company', 'investment', 'lead', 'venture', '...\n", "\n", "[3 rows x 7 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 16 } ] }, { "cell_type": "markdown", "metadata": { "id": "s_ntRM9scrsc" }, "source": [ "# **Business Developer, Sales, Strategic growth developer**" ] }, { "cell_type": "code", "metadata": { "id": "37_dQ84jbdHQ", "outputId": "f69876fe-3d83-41be-f346-bf9fdb020f62", "colab": { "base_uri": "https://localhost:8080/", "height": 51 } }, "source": [ "df_business = df[df['dominant_topic']==12]\n", "print(len(df_business))\n", "df_business.topic_prob.mean()" ], "execution_count": 17, "outputs": [ { "output_type": "stream", "text": [ "405\n" ], "name": "stdout" }, { "output_type": "execute_result", "data": { "text/plain": [ "0.12857702210846964" ] }, "metadata": { "tags": [] }, "execution_count": 17 } ] }, { "cell_type": "code", "metadata": { "id": "Z5JTnKxEc7yU", "outputId": "a8dcdb25-cc8d-4cd8-970e-8dc02b31a95f", "colab": { "base_uri": "https://localhost:8080/", "height": 128 } }, "source": [ "df_business_ = df_business[df_business['topic_prob']>0.14]\n", "print(len(df_business_))\n", "df_business_.head(2)" ], "execution_count": 18, "outputs": [ { "output_type": "stream", "text": [ "127\n" ], "name": "stdout" }, { "output_type": "execute_result", "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>url</th>\n", " <th>username</th>\n", " <th>title</th>\n", " <th>about</th>\n", " <th>dominant_topic</th>\n", " <th>topic_prob</th>\n", " <th>keywords</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>21</th>\n", " <td>https://www.linkedin.com/in/rsudhirshenoy</td>\n", " 
<td>rsudhirshenoy</td>\n", " <td>Product Management, Marketing and Channel Mana...</td>\n", " <td>Experienced Professional in the Technology Ind...</td>\n", " <td>12</td>\n", " <td>0.143474</td>\n", " <td>['business', 'sale', 'strategy', 'growth', 'ma...</td>\n", " </tr>\n", " <tr>\n", " <th>29</th>\n", " <td>https://in.linkedin.com/in/rishi-srivastava-12...</td>\n", " <td>rishi-srivastava-120b005</td>\n", " <td>Founder AARK TechAccess, General Partner BOV C...</td>\n", " <td>Executive with 26+ years of international expe...</td>\n", " <td>12</td>\n", " <td>0.154725</td>\n", " <td>['business', 'sale', 'strategy', 'growth', 'ma...</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " url ... keywords\n", "21 https://www.linkedin.com/in/rsudhirshenoy ... ['business', 'sale', 'strategy', 'growth', 'ma...\n", "29 https://in.linkedin.com/in/rishi-srivastava-12... ... ['business', 'sale', 'strategy', 'growth', 'ma...\n", "\n", "[2 rows x 7 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 18 } ] }, { "cell_type": "markdown", "metadata": { "id": "FfRLtqhugMqA" }, "source": [ "# **Management skills Profile**" ] }, { "cell_type": "code", "metadata": { "id": "xhrdMExGfpPH", "outputId": "ec589ab1-eb9c-4bf7-95bb-c0a14cb14b23", "colab": { "base_uri": "https://localhost:8080/", "height": 51 } }, "source": [ "df_manager = df[df['dominant_topic']==3]\n", "print(len(df_manager))\n", "df_manager.topic_prob.mean()" ], "execution_count": 19, "outputs": [ { "output_type": "stream", "text": [ "326\n" ], "name": "stdout" }, { "output_type": "execute_result", "data": { "text/plain": [ "0.13591641734432516" ] }, "metadata": { "tags": [] }, "execution_count": 19 } ] }, { "cell_type": "code", "metadata": { "id": "spsGGNqRgie8", "outputId": "8c41777a-84be-4258-e939-2b0d6dc5481d", "colab": { "base_uri": "https://localhost:8080/", "height": 128 } }, "source": [ "df_manager_ = df_manager[df_manager['topic_prob']>0.14]\n", "print(len(df_manager_))\n", 
"df_manager_.head(2)" ], "execution_count": 20, "outputs": [ { "output_type": "stream", "text": [ "131\n" ], "name": "stdout" }, { "output_type": "execute_result", "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>url</th>\n", " <th>username</th>\n", " <th>title</th>\n", " <th>about</th>\n", " <th>dominant_topic</th>\n", " <th>topic_prob</th>\n", " <th>keywords</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>96</th>\n", " <td>https://www.linkedin.com/in/kievewang</td>\n", " <td>kievewang</td>\n", " <td>Sr. Executive Assistant</td>\n", " <td>Skills: Customer Service, Technical/Customer S...</td>\n", " <td>3</td>\n", " <td>0.181637</td>\n", " <td>['management', 'customer', 'manage', 'process'...</td>\n", " </tr>\n", " <tr>\n", " <th>228</th>\n", " <td>https://www.linkedin.com/in/imnikhilbhaskaran/</td>\n", " <td>imnikhilbhaskaran</td>\n", " <td>Associate Director, Customer Success at Icertis</td>\n", " <td>I am passionate about delivering world class T...</td>\n", " <td>3</td>\n", " <td>0.156695</td>\n", " <td>['management', 'customer', 'manage', 'process'...</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " url ... keywords\n", "96 https://www.linkedin.com/in/kievewang ... ['management', 'customer', 'manage', 'process'...\n", "228 https://www.linkedin.com/in/imnikhilbhaskaran/ ... 
['management', 'customer', 'manage', 'process'...\n", "\n", "[2 rows x 7 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 20 } ] }, { "cell_type": "markdown", "metadata": { "id": "EiJec0rlIoa7" }, "source": [ "# **HR Profiles**" ] }, { "cell_type": "code", "metadata": { "id": "3vVZRFWag9HS", "outputId": "074ecbc4-3428-441c-a086-e69c74ba3719", "colab": { "base_uri": "https://localhost:8080/", "height": 162 } }, "source": [ "df_hr = df[df['dominant_topic']==4]\n", "print(len(df_hr))\n", "df_hr.head(2)" ], "execution_count": 21, "outputs": [ { "output_type": "stream", "text": [ "309\n" ], "name": "stdout" }, { "output_type": "execute_result", "data": { "text/html": [ "<div>\n", "<style scoped>\n", " .dataframe tbody tr th:only-of-type {\n", " vertical-align: middle;\n", " }\n", "\n", " .dataframe tbody tr th {\n", " vertical-align: top;\n", " }\n", "\n", " .dataframe thead th {\n", " text-align: right;\n", " }\n", "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", " <th></th>\n", " <th>url</th>\n", " <th>username</th>\n", " <th>title</th>\n", " <th>about</th>\n", " <th>dominant_topic</th>\n", " <th>topic_prob</th>\n", " <th>keywords</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", " <tr>\n", " <th>178</th>\n", " <td>https://www.linkedin.com/in/jonathan-parisot-1...</td>\n", " <td>jonathan-parisot-10054625</td>\n", " <td>Co-Founder & CEO at Actiondesk (YC S19)</td>\n", " <td>Building Actiondesk, a spreadsheet software na...</td>\n", " <td>4</td>\n", " <td>0.087037</td>\n", " <td>['team', 'build', 'lead', 'leadership', 'organ...</td>\n", " </tr>\n", " <tr>\n", " <th>298</th>\n", " <td>https://www.linkedin.com/in/meghan-herring-049...</td>\n", " <td>meghan-herring-0491635</td>\n", " <td>Senior Technical Recruiter</td>\n", " <td>SENIOR TECHNICAL RECRUITER / HUMAN RESOURCES L...</td>\n", " <td>4</td>\n", " <td>0.140461</td>\n", " <td>['team', 'build', 'lead', 'leadership', 'organ...</td>\n", " 
</tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ " url ... keywords\n", "178 https://www.linkedin.com/in/jonathan-parisot-1... ... ['team', 'build', 'lead', 'leadership', 'organ...\n", "298 https://www.linkedin.com/in/meghan-herring-049... ... ['team', 'build', 'lead', 'leadership', 'organ...\n", "\n", "[2 rows x 7 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 21 } ] }, { "cell_type": "markdown", "metadata": { "id": "ppfPSV0ZU5gI" }, "source": [ "**Marking rows for about section description with keywords of HR**" ] }, { "cell_type": "code", "metadata": { "id": "TKB9De2SJU9_", "outputId": "4c824920-24b8-46f6-b71d-1d781f8856a9", "colab": { "base_uri": "https://localhost:8080/", "height": 119 } }, "source": [ "# Substrings (lower-case) that mark an 'about' text as HR-related.\n", "hr_word = {'hr ', ' hr', 'human resource', 'recruit'}\n", "\n", "# Lower-case each description once; a missing 'about' counts as an empty description.\n", "about_lower = df_hr['about'].fillna('').str.lower()\n", "\n", "# One 'true' per matching keyword, so an empty list means no HR keyword was found.\n", "# assign() builds a new frame instead of writing into a slice of df.\n", "df_hr = df_hr.assign(\n", "    checking_hr_word=about_lower.apply(lambda text: ['true' for word in hr_word if word in text])\n", ")" ], "execution_count": 22, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "OoJYNcQ_VQqT" }, "source": [ "**collecting dataframe row_index where an about section has HR related words**" ] }, { "cell_type": "code", "metadata": { "id": "xq_yj5BQKHc3" }, "source": [ "# Index labels of the rows whose 'about' text matched at least one HR keyword.\n", "has_hr_word = df_hr['checking_hr_word'].map(len) > 0\n", "row_index_with_hr_words = df_hr[has_hr_word].index.tolist()" ], "execution_count": 23, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "gxcBB06-VrIm" }, "source": [ "**Collecting final Rows with HR 
related about section**" ] }, { "cell_type": "code", "metadata": { "id": "Gbumj-5cM6WW" }, "source": [ "# Columns carried into the final dataset; the helper column 'checking_hr_word' is left out.\n", "hr_columns = ['url', 'username', 'title', 'about', 'dominant_topic', 'topic_prob', 'keywords']\n", "\n", "# Pick every HR-matching row in one label-based selection.\n", "# reset_index(drop=True) gives the result a fresh 0..n-1 index, and the columns\n", "# exist even when no row matched, so the topic_prob filter below cannot fail.\n", "df_hr_ = df_hr.loc[row_index_with_hr_words, hr_columns].reset_index(drop=True)" ], "execution_count": 24, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "uAEIOZ8vSyf-", "outputId": "a2968841-fab5-410d-f1d5-83dbbc0f4699", "colab": { "base_uri": "https://localhost:8080/", "height": 34 } }, "source": [ "df_hr_ = df_hr_[df_hr_['topic_prob']>0.15]\n", "len(df_hr_)\n" ], "execution_count": 25, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "85" ] }, "metadata": { "tags": [] }, "execution_count": 25 } ] }, { "cell_type": "markdown", "metadata": { "id": "P7XRsRqbXSPJ" }, "source": [ "# **Final Data Preparation for Statistical Model Classification**" ] }, { "cell_type": "code", "metadata": { "id": "wgxy8cf-XcKz" }, "source": [ "df_final_data = pd.concat([df_marketing_,df_pm_,df_data_,df_sw_, df_entrepreneur_, df_investor_,df_manager_,df_business_,df_hr_],axis=0)" ], "execution_count": 26, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "DpeZ3nGqYXRF", "outputId": "ab0e544d-d912-4c3e-d45a-af4538a1828c", "colab": { "base_uri": "https://localhost:8080/", "height": 34 } }, "source": [ "len(df_final_data)" ], "execution_count": 27, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "1365" ] }, "metadata": { "tags": [] }, "execution_count": 27 } ] }, { "cell_type": "markdown", "metadata": { "id": "sm9j-OXtcPWx" }, "source": [ 
"# **Export final data frame**" ] }, { "cell_type": "code", "metadata": { "id": "YII0kg0Hcs3W" }, "source": [ "df_final_data.to_csv('/content/drive/My Drive/NLP/with_swaraj/Data/data_for_statistical_classification_1.csv')" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "yZywI1nndJes" }, "source": [ "" ], "execution_count": null, "outputs": [] } ] }