{ "cells": [ { "cell_type": "code", "execution_count": 40, "id": "b88ff5db-b83e-449e-96e4-617be37b3fd0", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\admin\\AppData\\Local\\Temp\\ipykernel_17756\\1736831531.py:7: ElasticsearchWarning: Elasticsearch built-in security features are not enabled. Without authentication, your cluster could be accessible to anyone. See https://www.elastic.co/guide/en/elasticsearch/reference/7.17/security-minimal-setup.html to enable security.\n", " response = es.search(body=query, index= \"daily_signup_new\" )\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
account_type_idaccount_type_countuser_from_iduser_from_countdoc_hashreport_date
0aadhaar, demographic, non_aadhaar607, 53, 8623E743, VL544CF, F542F1, 07460, 8146C, VID4, 17...343, 393,771, 15,312, 137,956, 37, 385,541, 1,...0511424ffsaiom96900e217d3Jun 9, 2017 @ 05:30:00.000
1aadhaar, demographic, non_aadhaar67, 513, 1623E743, VL544CF, F542F1, 07460, 8146C, VID4, 17...343, 393,771, 15,312, 137,956, 37, 385,541, 1,...0511424ffsaiom96900e217d3May 9, 2018 @ 05:30:00.000
2aadhaar, demographic, non_aadhaar167, 53, 123E743, VL544CF, F542F1, 07460, 8146C, VID4, 17...343, 393,771, 15,312, 137,956, 37, 385,541, 1,...0511424ffsaiom96900e217d3Oct 9, 2018 @ 05:30:00.000
3aadhaar, demographic, non_aadhaar1,627, 353, 1523E743, VL544CF, F542F1, 07460, 8146C, VID4, 17...343, 393,771, 15,312, 137,956, 37, 385,541, 1,...0511424ffsaiom96900e217d3Feb 9, 2019 @ 05:30:00.000
4aadhaar, demographic, non_aadhaar16, 3, 123E743, VL544CF, F542F1, 07460, 8146C, VID4, 17...343, 393,771, 15,312, 137,956, 37, 385,541, 1,...0511424ffsaiom96900e217d3Dec 9, 2019 @ 05:30:00.000
5aadhaar, demographic, non_aadhaar216, 23, 1223E743, VL544CF, F542F1, 07460, 8146C, VID4, 17...343, 393,771, 15,312, 137,956, 37, 385,541, 1,...0511424ffsaiom96900e217d3Nov 9, 2020 @ 05:30:00.000
6aadhaar, demographic, non_aadhaar2,816, 238, 1,8223E743, VL544CF, F542F1, 07460, 8146C, VID4, 17...343, 393,771, 15,312, 137,956, 37, 385,541, 1,...0511424ffsaiom96900e217d3Sep 9, 2020 @ 05:30:00.000
7aadhaar, demographic, non_aadhaar81, 23, 183E743, VL544CF, F542F1, 07460, 8146C, VID4, 17...343, 393,771, 15,312, 137,956, 37, 385,541, 1,...0511424ffsaiom96900e217d3Sep 2, 2020 @ 05:30:00.000
8aadhaar, demographic, non_aadhaar8, 8, 83E743, VL544CF, F542F1, 07460, 8146C, VID4, 17...343, 393,771, 15,312, 137,956, 37, 385,541, 1,...0511424ffsaiom96900e217d3Sep 3, 2020 @ 05:30:00.000
9aadhaar, demographic, non_aadhaar8, 8, 83E743, VL544CF, F542F1, 07460, 8146C, VID4, 17...343, 393,771, 15,312, 137,956, 37, 385,541, 1,...0511424ffsaiom96900e217d3Sep 3, 2021 @ 05:30:00.000
10aadhaar, demographic, non_aadhaar85, 78, 8993E743, VL544CF, F542F1, 07460, 8146C, VID4, 17...343, 393,771, 15,312, 137,956, 37, 385,541, 1,...0511424ffsaiom96900e217d3Apr 3, 2021 @ 05:30:00.000
11aadhaar, demographic, non_aadhaar875, 768, 893E743, VL544CF, F542F1, 07460, 8146C, VID4, 17...343, 393,771, 15,312, 137,956, 37, 385,541, 1,...0511424ffsaiom96900e217d3Aug 3, 2021 @ 05:30:00.000
12aadhaar, demographic, non_aadhaar75, 768, 893E743, VL544CF, F542F1, 07460, 8146C, VID4, 17...343, 393,771, 15,312, 137,956, 37, 385,541, 1,...0511424ffsaiom96900e217d3Aug 3, 2022 @ 05:30:00.000
13aadhaar, demographic, non_aadhaar175, 68, 1893E743, VL544CF, F542F1, 07460, 8146C, VID4, 17...343, 393,771, 15,312, 137,956, 37, 385,541, 1,...0511424ffsaiom96900e217d3Sep 3, 2022 @ 05:30:00.000
14aadhaar, demographic, non_aadhaar15, 8, 193E743, VL544CF, F542F1, 07460, 8146C, VID4, 17...343, 393,771, 15,312, 137,956, 37, 385,541, 1,...0511424ffsaiom96900e217d3Jul 3, 2022 @ 05:30:00.000
15aadhaar, demographic, non_aadhaar155, 58, 1593E743, VL544CF, F542F1, 07460, 8146C, VID4, 17...343, 393,771, 15,312, 137,956, 37, 385,541, 1,...0511424ffsaiom96900e217d3May 3, 2023 @ 05:30:00.000
16aadhaar, demographic, non_aadhaar855, 588, 1693E743, VL544CF, F542F1, 07460, 8146C, VID4, 17...343, 393,771, 15,312, 137,956, 37, 385,541, 1,...0511424ffsaiom96900e217d3Nov 3, 2023 @ 05:30:00.000
17aadhaar, demographic, non_aadhaar167, 159, 2673E743, VL544CF, F542F1, 07460, 8146C, VID4, 17...343, 393,771, 15,312, 137,956, 37, 385,541, 1,...0511424ffsaiom96900e217d3Nov 9, 2015 @ 05:30:00.000
18aadhaar, demographic, non_aadhaar467, 1,593, 263E743, VL544CF, F542F1, 07460, 8146C, VID4, 17...343, 393,771, 15,312, 137,956, 37, 385,541, 1,...0511424ffsaiom96900e217d3Nov 9, 2016 @ 05:30:00.000
19aadhaar, demographic, non_aadhaar1,697, 1,559, 2,7673E743, VL544CF, F542F1, 07460, 8146C, VID4, 17...343, 393,771, 15,312, 137,956, 37, 385,541, 1,...0511424ffsaiom96900e217d3Mar 9, 2015 @ 05:30:00.000
20aadhaar, demographic, non_aadhaar167, 159, 2673E743, VL544CF, F542F1, 07460, 8146C, VID4, 17...343, 393,771, 15,312, 137,956, 37, 385,541, 1,...0511424ffsaiom96900e217d3Sep 9, 2015 @ 05:30:00.000
21aadhaar, demographic, non_aadhaar8,525, 2,588, 3,1693E743, VL544CF, F542F1, 07460, 8146C, VID4, 17...343, 393,771, 15,312, 137,956, 37, 385,541, 1,...0511424ffsaiom96900e217d3Jan 3, 2023 @ 05:30:00.000
22aadhaar, demographic, non_aadhaar467, 1,593, 263E743, VL544CF, F542F1, 07460, 8146C, VID4, 17...343, 393,771, 15,312, 137,956, 37, 385,541, 1,...0511424ffsaiom96900e217d3Nov 9, 2024 @ 05:30:00.000
23aadhaar, demographic, non_aadhaar67, 593, 863E743, VL544CF, F542F1, 07460, 8146C, VID4, 17...343, 393,771, 15,312, 137,956, 37, 385,541, 1,...0511424ffsaiom96900e217d3Jan 9, 2016 @ 05:30:00.000
24aadhaar, demographic, non_aadhaar607, 53, 8623E743, VL544CF, F542F1, 07460, 8146C, VID4, 17...343, 393,771, 15,312, 137,956, 37, 385,541, 1,...0511424ffsaiom96900e217d3Jan 9, 2017 @ 05:30:00.000
25aadhaar, demographic, non_aadhaar85, 88, 693E743, VL544CF, F542F1, 07460, 8146C, VID4, 17...343, 393,771, 15,312, 137,956, 37, 385,541, 1,...0511424ffsaiom96900e217d3Jan 3, 2024 @ 05:30:00.000
26aadhaar, demographic, non_aadhaar83,434, 8,834, 6,9343E743, VL544CF, F542F1, 07460, 8146C, VID4, 17...343, 393,771, 15,312, 137,956, 37, 385,541, 1,...0511424ffsaiom96900e217d3Feb 6, 2024 @ 05:30:00.000
27aadhaar, demographic, non_aadhaar36, 34, 793E743, VL544CF, F542F1, 07460, 8146C, VID4, 17...343, 393,771, 15,312, 137,956, 37, 385,541, 1,...0511424ffsaiom96900e217d3Jan 26, 2020 @ 05:30:00.000
\n", "
" ], "text/plain": [ " account_type_id account_type_count \\\n", "0 aadhaar, demographic, non_aadhaar 607, 53, 862 \n", "1 aadhaar, demographic, non_aadhaar 67, 513, 162 \n", "2 aadhaar, demographic, non_aadhaar 167, 53, 12 \n", "3 aadhaar, demographic, non_aadhaar 1,627, 353, 152 \n", "4 aadhaar, demographic, non_aadhaar 16, 3, 12 \n", "5 aadhaar, demographic, non_aadhaar 216, 23, 122 \n", "6 aadhaar, demographic, non_aadhaar 2,816, 238, 1,822 \n", "7 aadhaar, demographic, non_aadhaar 81, 23, 18 \n", "8 aadhaar, demographic, non_aadhaar 8, 8, 8 \n", "9 aadhaar, demographic, non_aadhaar 8, 8, 8 \n", "10 aadhaar, demographic, non_aadhaar 85, 78, 899 \n", "11 aadhaar, demographic, non_aadhaar 875, 768, 89 \n", "12 aadhaar, demographic, non_aadhaar 75, 768, 89 \n", "13 aadhaar, demographic, non_aadhaar 175, 68, 189 \n", "14 aadhaar, demographic, non_aadhaar 15, 8, 19 \n", "15 aadhaar, demographic, non_aadhaar 155, 58, 159 \n", "16 aadhaar, demographic, non_aadhaar 855, 588, 169 \n", "17 aadhaar, demographic, non_aadhaar 167, 159, 267 \n", "18 aadhaar, demographic, non_aadhaar 467, 1,593, 26 \n", "19 aadhaar, demographic, non_aadhaar 1,697, 1,559, 2,767 \n", "20 aadhaar, demographic, non_aadhaar 167, 159, 267 \n", "21 aadhaar, demographic, non_aadhaar 8,525, 2,588, 3,169 \n", "22 aadhaar, demographic, non_aadhaar 467, 1,593, 26 \n", "23 aadhaar, demographic, non_aadhaar 67, 593, 86 \n", "24 aadhaar, demographic, non_aadhaar 607, 53, 862 \n", "25 aadhaar, demographic, non_aadhaar 85, 88, 69 \n", "26 aadhaar, demographic, non_aadhaar 83,434, 8,834, 6,934 \n", "27 aadhaar, demographic, non_aadhaar 36, 34, 79 \n", "\n", " user_from_id \\\n", "0 3E743, VL544CF, F542F1, 07460, 8146C, VID4, 17... \n", "1 3E743, VL544CF, F542F1, 07460, 8146C, VID4, 17... \n", "2 3E743, VL544CF, F542F1, 07460, 8146C, VID4, 17... \n", "3 3E743, VL544CF, F542F1, 07460, 8146C, VID4, 17... \n", "4 3E743, VL544CF, F542F1, 07460, 8146C, VID4, 17... \n", "5 3E743, VL544CF, F542F1, 07460, 8146C, VID4, 17... \n", "6 3E743, VL544CF, F542F1, 07460, 8146C, VID4, 17... \n", "7 3E743, VL544CF, F542F1, 07460, 8146C, VID4, 17... \n", "8 3E743, VL544CF, F542F1, 07460, 8146C, VID4, 17... \n", "9 3E743, VL544CF, F542F1, 07460, 8146C, VID4, 17... \n", "10 3E743, VL544CF, F542F1, 07460, 8146C, VID4, 17... \n", "11 3E743, VL544CF, F542F1, 07460, 8146C, VID4, 17... \n", "12 3E743, VL544CF, F542F1, 07460, 8146C, VID4, 17... \n", "13 3E743, VL544CF, F542F1, 07460, 8146C, VID4, 17... \n", "14 3E743, VL544CF, F542F1, 07460, 8146C, VID4, 17... \n", "15 3E743, VL544CF, F542F1, 07460, 8146C, VID4, 17... \n", "16 3E743, VL544CF, F542F1, 07460, 8146C, VID4, 17... \n", "17 3E743, VL544CF, F542F1, 07460, 8146C, VID4, 17... \n", "18 3E743, VL544CF, F542F1, 07460, 8146C, VID4, 17... \n", "19 3E743, VL544CF, F542F1, 07460, 8146C, VID4, 17... \n", "20 3E743, VL544CF, F542F1, 07460, 8146C, VID4, 17... \n", "21 3E743, VL544CF, F542F1, 07460, 8146C, VID4, 17... \n", "22 3E743, VL544CF, F542F1, 07460, 8146C, VID4, 17... \n", "23 3E743, VL544CF, F542F1, 07460, 8146C, VID4, 17... \n", "24 3E743, VL544CF, F542F1, 07460, 8146C, VID4, 17... \n", "25 3E743, VL544CF, F542F1, 07460, 8146C, VID4, 17... \n", "26 3E743, VL544CF, F542F1, 07460, 8146C, VID4, 17... \n", "27 3E743, VL544CF, F542F1, 07460, 8146C, VID4, 17... \n", "\n", " user_from_count \\\n", "0 343, 393,771, 15,312, 137,956, 37, 385,541, 1,... \n", "1 343, 393,771, 15,312, 137,956, 37, 385,541, 1,... \n", "2 343, 393,771, 15,312, 137,956, 37, 385,541, 1,... \n", "3 343, 393,771, 15,312, 137,956, 37, 385,541, 1,... \n", "4 343, 393,771, 15,312, 137,956, 37, 385,541, 1,... \n", "5 343, 393,771, 15,312, 137,956, 37, 385,541, 1,... \n", "6 343, 393,771, 15,312, 137,956, 37, 385,541, 1,... \n", "7 343, 393,771, 15,312, 137,956, 37, 385,541, 1,... \n", "8 343, 393,771, 15,312, 137,956, 37, 385,541, 1,... \n", "9 343, 393,771, 15,312, 137,956, 37, 385,541, 1,... \n", "10 343, 393,771, 15,312, 137,956, 37, 385,541, 1,... \n", "11 343, 393,771, 15,312, 137,956, 37, 385,541, 1,... \n", "12 343, 393,771, 15,312, 137,956, 37, 385,541, 1,... \n", "13 343, 393,771, 15,312, 137,956, 37, 385,541, 1,... \n", "14 343, 393,771, 15,312, 137,956, 37, 385,541, 1,... \n", "15 343, 393,771, 15,312, 137,956, 37, 385,541, 1,... \n", "16 343, 393,771, 15,312, 137,956, 37, 385,541, 1,... \n", "17 343, 393,771, 15,312, 137,956, 37, 385,541, 1,... \n", "18 343, 393,771, 15,312, 137,956, 37, 385,541, 1,... \n", "19 343, 393,771, 15,312, 137,956, 37, 385,541, 1,... \n", "20 343, 393,771, 15,312, 137,956, 37, 385,541, 1,... \n", "21 343, 393,771, 15,312, 137,956, 37, 385,541, 1,... \n", "22 343, 393,771, 15,312, 137,956, 37, 385,541, 1,... \n", "23 343, 393,771, 15,312, 137,956, 37, 385,541, 1,... \n", "24 343, 393,771, 15,312, 137,956, 37, 385,541, 1,... \n", "25 343, 393,771, 15,312, 137,956, 37, 385,541, 1,... \n", "26 343, 393,771, 15,312, 137,956, 37, 385,541, 1,... \n", "27 343, 393,771, 15,312, 137,956, 37, 385,541, 1,... \n", "\n", " doc_hash report_date \n", "0 0511424ffsaiom96900e217d3 Jun 9, 2017 @ 05:30:00.000 \n", "1 0511424ffsaiom96900e217d3 May 9, 2018 @ 05:30:00.000 \n", "2 0511424ffsaiom96900e217d3 Oct 9, 2018 @ 05:30:00.000 \n", "3 0511424ffsaiom96900e217d3 Feb 9, 2019 @ 05:30:00.000 \n", "4 0511424ffsaiom96900e217d3 Dec 9, 2019 @ 05:30:00.000 \n", "5 0511424ffsaiom96900e217d3 Nov 9, 2020 @ 05:30:00.000 \n", "6 0511424ffsaiom96900e217d3 Sep 9, 2020 @ 05:30:00.000 \n", "7 0511424ffsaiom96900e217d3 Sep 2, 2020 @ 05:30:00.000 \n", "8 0511424ffsaiom96900e217d3 Sep 3, 2020 @ 05:30:00.000 \n", "9 0511424ffsaiom96900e217d3 Sep 3, 2021 @ 05:30:00.000 \n", "10 0511424ffsaiom96900e217d3 Apr 3, 2021 @ 05:30:00.000 \n", "11 0511424ffsaiom96900e217d3 Aug 3, 2021 @ 05:30:00.000 \n", "12 0511424ffsaiom96900e217d3 Aug 3, 2022 @ 05:30:00.000 \n", "13 0511424ffsaiom96900e217d3 Sep 3, 2022 @ 05:30:00.000 \n", "14 0511424ffsaiom96900e217d3 Jul 3, 2022 @ 05:30:00.000 \n", "15 0511424ffsaiom96900e217d3 May 3, 2023 @ 05:30:00.000 \n", "16 0511424ffsaiom96900e217d3 Nov 3, 2023 @ 05:30:00.000 \n", "17 0511424ffsaiom96900e217d3 Nov 9, 2015 @ 05:30:00.000 \n", "18 0511424ffsaiom96900e217d3 Nov 9, 2016 @ 05:30:00.000 \n", "19 0511424ffsaiom96900e217d3 Mar 9, 2015 @ 05:30:00.000 \n", "20 0511424ffsaiom96900e217d3 Sep 9, 2015 @ 05:30:00.000 \n", "21 0511424ffsaiom96900e217d3 Jan 3, 2023 @ 05:30:00.000 \n", "22 0511424ffsaiom96900e217d3 Nov 9, 2024 @ 05:30:00.000 \n", "23 0511424ffsaiom96900e217d3 Jan 9, 2016 @ 05:30:00.000 \n", "24 0511424ffsaiom96900e217d3 Jan 9, 2017 @ 05:30:00.000 \n", "25 0511424ffsaiom96900e217d3 Jan 3, 2024 @ 05:30:00.000 \n", "26 0511424ffsaiom96900e217d3 Feb 6, 2024 @ 05:30:00.000 \n", "27 0511424ffsaiom96900e217d3 Jan 26, 2020 @ 05:30:00.000 " ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "query ={\n", " \"query\": {\n", " \"match_all\": {}\n", " },\n", " \"size\":10000\n", " }\n", "response = es.search(body=query, index= \"daily_signup_new\" )\n", "hits = response[\"hits\"][\"hits\"]\n", "source_data = [hit[\"_source\"] for hit in hits]\n", "ls = []\n", "for i in hits:\n", " temp = {}\n", " temp['account_type_id'] = i['_source']['account_type__id_keyword']\n", " temp['account_type_count'] = i['_source']['account_type_count']\n", " temp['user_from_id'] = i['_source']['user_from__id']\n", " temp['user_from_count'] = i['_source']['user_from_count']\n", " temp['doc_hash'] = i['_source']['doc_hash']\n", " temp['report_date'] = i['_source']['report_date']\n", " ls.append(temp)\n", "df = pd.DataFrame(ls)\n", "df" ] }, { "cell_type": "code", "execution_count": null, "id": "5bc5181b-5f91-4c38-86b6-2bfad5dbd795", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "6f92d399-d73e-4954-8904-ea750da9ff44", "metadata": {}, "source": [ "# Final Script" ] }, { "cell_type": "code", "execution_count": 1, "id": "f6caeecf-55e4-45de-9b92-259f9c51ac08", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "from elasticsearch import Elasticsearch\n", "from elasticsearch.helpers import scan\n", "from datetime import datetime\n", "import json\n", "import hashlib" ] }, { "cell_type": "code", "execution_count": 26, "id": "3fda91da-f647-42a6-a439-879fdf292e72", "metadata": {}, "outputs": [], "source": [ "def preprocessing(es):\n", " query ={\n", " \"query\": {\n", " \"match_all\": {}\n", " },\n", " \"size\":10000\n", " }\n", " response = es.search(body=query, index= \"daily_signup_new\" )\n", " hits = response[\"hits\"][\"hits\"]\n", " source_data = [hit[\"_source\"] for hit in hits]\n", " ls = []\n", " for i in hits:\n", " temp = {}\n", " temp['account_type_id'] = i['_source']['account_type__id_keyword']\n", " temp['account_type_count'] = i['_source']['account_type_count']\n", " temp['user_from_id'] = i['_source']['user_from__id']\n", " temp['user_from_count'] = i['_source']['user_from_count']\n", " temp['doc_hash'] = i['_source']['doc_hash']\n", " temp['report_date'] = i['_source']['report_date']\n", " ls.append(temp)\n", " df = pd.DataFrame(ls)\n", " df['account_type_id2'] = df['account_type_id'].map(lambda x:str(x).replace(' ','').split(','))\n", " df['account_type_count2'] = df['account_type_count'].map(lambda x:str(x).replace(' ','').split(','))\n", " df['user_from_id2'] = df['user_from_id'].map(lambda x:str(x).replace(' ','').split(','))\n", " df['user_from_count2'] =df['user_from_count'].map(lambda x:str(x).replace(',','').split(' '))\n", " ls1 = []\n", " ls2 = []\n", " for i in range(df.shape[0]):\n", " temp1 = {}\n", " temp2 = {}\n", " for j in range(0,len(df['account_type_id2'][i])):\n", " temp1[str(df['account_type_id2'][i][j])] = int(df['account_type_count2'][i][j])\n", " ls1.append(temp1)\n", " for j in range(0,len(df['user_from_id2'][i])):\n", " temp2[str(df['user_from_id2'][i][j])] = int(df['user_from_count2'][i][j])\n", " ls2.append(temp2)\n", " df['account_type_stats'] = ls1\n", " df['user_type_stats'] = ls2\n", " df.drop(columns = ['account_type_id','account_type_count','user_from_id','user_from_count','account_type_id2','account_type_count2'\n", " ,'user_from_id2','user_from_count2'], inplace=True)\n", " format = '%b %d, %Y @ %H:%M:%S.%f'\n", " df['report_date']=df['report_date'].map(lambda x: datetime.strptime(x, format))\n", " return df" ] }, { "cell_type": "code", "execution_count": 3, "id": "01ecd3b5-c37a-4ebb-8c50-e245fed2ab36", "metadata": {}, "outputs": [], "source": [ "def dictsum1(list_of_dicts):\n", " summed_dict = {}\n", " total = 0\n", " for item in list_of_dicts:\n", " for key, value in item.items():\n", " total += value\n", " summed_dict['total_account_type'] = total\n", " summed_dict[key] = summed_dict.get(key, 0) + value\n", " return summed_dict" ] }, { "cell_type": "code", "execution_count": 4, "id": "228a13ad-8c01-4cee-8c4f-6875ce26d550", "metadata": {}, "outputs": [], "source": [ "def dictsum2(list_of_dicts):\n", " summed_dict = {}\n", " total = 0\n", " for item in list_of_dicts:\n", " for key, value in item.items():\n", " total += value\n", " summed_dict['total_user_type'] = total\n", " summed_dict[key] = summed_dict.get(key, 0) + value\n", " return summed_dict" ] }, { "cell_type": "code", "execution_count": 6, "id": "b3f6b7f7-058f-43d7-803c-213c2873a3e3", "metadata": {}, "outputs": [], "source": [ "def data_to_elastic(data):\n", " es = Elasticsearch(\"http://localhost:9200\")\n", " for i in range(data.shape[0]):\n", " year = data.iloc[i].name\n", " doc_id = hashlib.md5(str(year).encode()).hexdigest()\n", " print(f'stats_account_{year}')\n", " tmp = data.iloc[i].to_dict()\n", " print(tmp)\n", " resp = es.index(index=f'account_stats_{year}', id=doc_id, document=tmp)" ] }, { "cell_type": "code", "execution_count": 7, "id": "84741677-5fe1-4325-ad4c-9a7908212f66", "metadata": {}, "outputs": [], "source": [ "es = Elasticsearch(\"http://localhost:9200\")" ] }, { "cell_type": "code", "execution_count": 27, "id": "db664a6e-1769-4f24-8ac6-dcd2d7529d19", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\admin\\AppData\\Local\\Temp\\ipykernel_17756\\3227805958.py:8: ElasticsearchWarning: Elasticsearch built-in security features are not enabled. Without authentication, your cluster could be accessible to anyone. See https://www.elastic.co/guide/en/elasticsearch/reference/7.17/security-minimal-setup.html to enable security.\n", " response = es.search(body=query, index= \"daily_signup_new\" )\n" ] } ], "source": [ "fin_df = preprocessing(es)" ] }, { "cell_type": "code", "execution_count": 29, "id": "2d305a1b-09ab-4e20-836c-a3b0b64c35aa", "metadata": {}, "outputs": [], "source": [ "df_user = fin_df.groupby(fin_df['report_date'].dt.year)['user_type_stats'].apply(dictsum2).reset_index()" ] }, { "cell_type": "code", "execution_count": 18, "id": "e1020964-6fa0-4c4d-aa44-3f171783f4ca", "metadata": {}, "outputs": [], "source": [ "df_acc = fin_df.groupby(fin_df['report_date'].dt.year)['account_type_stats'].apply(dictsum1).reset_index()" ] }, { "cell_type": "code", "execution_count": 35, "id": "68a95bef-d96f-4dfd-9467-94fce7fcbbec", "metadata": {}, "outputs": [], "source": [ "df_user = df_user.set_index(['report_date','level_1'])['user_type_stats'].unstack(fill_value=0)" ] }, { "cell_type": "code", "execution_count": 36, "id": "8ff4c432-a8cc-4bb0-8c21-d4b09e7da672", "metadata": {}, "outputs": [], "source": [ "df_acc = df_acc.reset_index().set_index(['report_date','level_1'])['account_type_stats'].unstack(fill_value=0)" ] }, { "cell_type": "code", "execution_count": 37, "id": "a152fb2d-4374-407e-9af7-fd78596077bd", "metadata": {}, "outputs": [], "source": [ "final_df = pd.merge(df_user, df_acc, on='report_date')" ] }, { "cell_type": "code", "execution_count": 38, "id": "44e9603e-c676-4d87-8059-f22d634176b9", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
level_10746008430414C0C4174B3E7438146C9343F542F1K4WFIQNASX87FVID4VL544CFtotal_user_typeaadhaardemographicnon_aadhaartotal_account_type
report_date
20154138689991863495102911194593635409634265711566231181313316114833510155351885
201627591266612423306867463062423606422843877108278754221074325345946791807
20172759126661242330686746306242360642284387710827875422107432121410617243044
20182759126661242330686746306242360642284387710827875422107432234566174974
20192759126661242330686746306242360642284387710827875422107432176303651012
202068978016653105825171518515765605901605710951927705196885552685803439044651712
2021413868999186349510291119459363540963426571156623118131331611489688549962818
2022413868999186349510291119459363540963426571156623118131331611482658442971406
202341386899918634951029111945936354096342657115662311813133161148101811713302519
2024413868999186349510291119459363540963426571156623118131331611486355236701828
\n", "
" ], "text/plain": [ "level_1 07460 084304 14C0C4 174B 3E743 8146C 9343 F542F1 K4WFIQ \\\n", "report_date \n", "2015 413868 999 186 3495 1029 111 9 45936 354096 \n", "2016 275912 666 124 2330 686 74 6 30624 236064 \n", "2017 275912 666 124 2330 686 74 6 30624 236064 \n", "2018 275912 666 124 2330 686 74 6 30624 236064 \n", "2019 275912 666 124 2330 686 74 6 30624 236064 \n", "2020 689780 1665 310 5825 1715 185 15 76560 590160 \n", "2021 413868 999 186 3495 1029 111 9 45936 354096 \n", "2022 413868 999 186 3495 1029 111 9 45936 354096 \n", "2023 413868 999 186 3495 1029 111 9 45936 354096 \n", "2024 413868 999 186 3495 1029 111 9 45936 354096 \n", "\n", "level_1 NA SX87F VID4 VL544CF total_user_type aadhaar \\\n", "report_date \n", "2015 3426 57 1156623 1181313 3161148 335 \n", "2016 2284 38 771082 787542 2107432 534 \n", "2017 2284 38 771082 787542 2107432 1214 \n", "2018 2284 38 771082 787542 2107432 234 \n", "2019 2284 38 771082 787542 2107432 17 \n", "2020 5710 95 1927705 1968855 5268580 343 \n", "2021 3426 57 1156623 1181313 3161148 968 \n", "2022 3426 57 1156623 1181313 3161148 265 \n", "2023 3426 57 1156623 1181313 3161148 1018 \n", "2024 3426 57 1156623 1181313 3161148 635 \n", "\n", "level_1 demographic non_aadhaar total_account_type \n", "report_date \n", "2015 1015 535 1885 \n", "2016 594 679 1807 \n", "2017 106 1724 3044 \n", "2018 566 174 974 \n", "2019 630 365 1012 \n", "2020 904 465 1712 \n", "2021 854 996 2818 \n", "2022 844 297 1406 \n", "2023 1171 330 2519 \n", "2024 523 670 1828 " ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ "final_df" ] }, { "cell_type": "code", "execution_count": 39, "id": "50074794-1995-4368-ad91-3f7a1daa1dc2", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "stats_account_2015\n", "{'07460': 413868, '084304': 999, '14C0C4': 186, '174B': 3495, '3E743': 1029, '8146C': 111, '9343': 9, 'F542F1': 45936, 'K4WFIQ': 354096, 'NA': 3426, 'SX87F': 57, 'VID4': 1156623, 'VL544CF': 1181313, 'total_user_type': 3161148, 'aadhaar': 335, 'demographic': 1015, 'non_aadhaar': 535, 'total_account_type': 1885}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\admin\\AppData\\Local\\Temp\\ipykernel_17756\\867504021.py:9: ElasticsearchWarning: Elasticsearch built-in security features are not enabled. Without authentication, your cluster could be accessible to anyone. See https://www.elastic.co/guide/en/elasticsearch/reference/7.17/security-minimal-setup.html to enable security.\n", " resp = es.index(index=f'account_stats_{year}', id=doc_id, document=tmp)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "stats_account_2016\n", "{'07460': 275912, '084304': 666, '14C0C4': 124, '174B': 2330, '3E743': 686, '8146C': 74, '9343': 6, 'F542F1': 30624, 'K4WFIQ': 236064, 'NA': 2284, 'SX87F': 38, 'VID4': 771082, 'VL544CF': 787542, 'total_user_type': 2107432, 'aadhaar': 534, 'demographic': 594, 'non_aadhaar': 679, 'total_account_type': 1807}\n", "stats_account_2017\n", "{'07460': 275912, '084304': 666, '14C0C4': 124, '174B': 2330, '3E743': 686, '8146C': 74, '9343': 6, 'F542F1': 30624, 'K4WFIQ': 236064, 'NA': 2284, 'SX87F': 38, 'VID4': 771082, 'VL544CF': 787542, 'total_user_type': 2107432, 'aadhaar': 1214, 'demographic': 106, 'non_aadhaar': 1724, 'total_account_type': 3044}\n", "stats_account_2018\n", "{'07460': 275912, '084304': 666, '14C0C4': 124, '174B': 2330, '3E743': 686, '8146C': 74, '9343': 6, 'F542F1': 30624, 'K4WFIQ': 236064, 'NA': 2284, 'SX87F': 38, 'VID4': 771082, 'VL544CF': 787542, 'total_user_type': 2107432, 'aadhaar': 234, 'demographic': 566, 'non_aadhaar': 174, 'total_account_type': 974}\n", "stats_account_2019\n", "{'07460': 275912, '084304': 666, '14C0C4': 124, '174B': 2330, '3E743': 686, '8146C': 74, '9343': 6, 'F542F1': 30624, 'K4WFIQ': 236064, 'NA': 2284, 'SX87F': 38, 'VID4': 771082, 'VL544CF': 787542, 'total_user_type': 2107432, 'aadhaar': 17, 'demographic': 630, 'non_aadhaar': 365, 'total_account_type': 1012}\n", "stats_account_2020\n", "{'07460': 689780, '084304': 1665, '14C0C4': 310, '174B': 5825, '3E743': 1715, '8146C': 185, '9343': 15, 'F542F1': 76560, 'K4WFIQ': 590160, 'NA': 5710, 'SX87F': 95, 'VID4': 1927705, 'VL544CF': 1968855, 'total_user_type': 5268580, 'aadhaar': 343, 'demographic': 904, 'non_aadhaar': 465, 'total_account_type': 1712}\n", "stats_account_2021\n", "{'07460': 413868, '084304': 999, '14C0C4': 186, '174B': 3495, '3E743': 1029, '8146C': 111, '9343': 9, 'F542F1': 45936, 'K4WFIQ': 354096, 'NA': 3426, 'SX87F': 57, 'VID4': 1156623, 'VL544CF': 1181313, 'total_user_type': 3161148, 'aadhaar': 968, 'demographic': 854, 'non_aadhaar': 996, 'total_account_type': 2818}\n", "stats_account_2022\n", "{'07460': 413868, '084304': 999, '14C0C4': 186, '174B': 3495, '3E743': 1029, '8146C': 111, '9343': 9, 'F542F1': 45936, 'K4WFIQ': 354096, 'NA': 3426, 'SX87F': 57, 'VID4': 1156623, 'VL544CF': 1181313, 'total_user_type': 3161148, 'aadhaar': 265, 'demographic': 844, 'non_aadhaar': 297, 'total_account_type': 1406}\n", "stats_account_2023\n", "{'07460': 413868, '084304': 999, '14C0C4': 186, '174B': 3495, '3E743': 1029, '8146C': 111, '9343': 9, 'F542F1': 45936, 'K4WFIQ': 354096, 'NA': 3426, 'SX87F': 57, 'VID4': 1156623, 'VL544CF': 1181313, 'total_user_type': 3161148, 'aadhaar': 1018, 'demographic': 1171, 'non_aadhaar': 330, 'total_account_type': 2519}\n", "stats_account_2024\n", "{'07460': 413868, '084304': 999, '14C0C4': 186, '174B': 3495, '3E743': 1029, '8146C': 111, '9343': 9, 'F542F1': 45936, 'K4WFIQ': 354096, 'NA': 3426, 'SX87F': 57, 'VID4': 1156623, 'VL544CF': 1181313, 'total_user_type': 3161148, 'aadhaar': 635, 'demographic': 523, 'non_aadhaar': 670, 'total_account_type': 1828}\n" ] } ], "source": [ "data_to_elastic(final_df)" ] }, { "cell_type": "code", "execution_count": null, "id": "2e3b4fa7-beb9-4bca-9e4e-e6a487f0d25d", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.0" } }, "nbformat": 4, "nbformat_minor": 5 }