Exploring Perceptions and Emotions During COVID-19¶
A Mixed-Data Analysis of Survey Responses and Twitter Content¶
Introduction¶
The COVID-19 pandemic significantly altered daily life across the world, shaping public sentiment, emotional well-being, and opinions on government measures. This project presents a dual-perspective analysis by examining:
- Structured survey data collected from individuals in Milan and New York
- Unstructured social media data from Twitter
Using Python-based tools and libraries such as Pandas, NumPy, Matplotlib, Seaborn, TextBlob, and NLTK, this analysis explores how people felt, reacted, and reflected during various phases of the pandemic.
The project is organized into three key sections:
Data Preprocessing
Loading, cleaning, and transforming the survey and Twitter datasets. Column renaming and feature engineering were essential for clarity and effective analysis.
Survey Data Analysis
An in-depth exploration of how demographics, living situations, and work/study conditions influenced emotional and psychological responses. Topics include:
- Emotional well-being and living arrangements
- Public opinion on mask mandates, lockdowns, and vaccination passes
- Feelings and experiences after testing positive for COVID-19
- Impact on students and professionals
- Perceptions of returning to "normality"
- General emotions associated with the pandemic
Twitter Data Analysis
Natural language processing (NLP) techniques were applied to examine sentiment and categorize emotions in real-time, COVID-19-related tweets. This includes:
- Text cleaning and preprocessing
- Sentiment analysis using polarity and subjectivity scores
- Emotion classification using keyword matching, WordNet expansion, and manual variation detection
- Comparison of online sentiment with reported emotions from survey participants
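Before diving in, here is a minimal, self-contained sketch of the keyword-matching idea used for emotion classification (the keyword lists and function name here are illustrative, not the ones used in the analysis):

```python
# Minimal keyword-matching emotion classifier (illustrative keyword lists).
EMOTION_KEYWORDS = {
    "anxious": {"worried", "scared", "afraid", "anxious"},
    "hopeful": {"hope", "better", "recover", "optimistic"},
    "frustrated": {"tired", "fed up", "frustrated", "sick of"},
}

def classify_emotion(text: str) -> str:
    """Return the emotion whose keywords appear most often, or 'neutral'."""
    lowered = text.lower()
    scores = {emotion: sum(keyword in lowered for keyword in keywords)
              for emotion, keywords in EMOTION_KEYWORDS.items()}
    best = max(scores, key=scores.get)
    return best if scores[best] > 0 else "neutral"

print(classify_emotion("I am so tired and fed up with lockdowns"))  # frustrated
```

The actual pipeline additionally expands keywords with WordNet synonyms and detects manual word variations, but the core scoring step is this simple.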
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
sns.set_style("whitegrid")
Dataset Source
The survey dataset is available at https://dataverse.harvard.edu/file.xhtml?fileId=6913780&version=1.0
covid = pd.read_csv('data/dataset.csv')
covid
City | At the beginning of the pandemic did you work or study? | Family | Friends | Pets | Roommates | People I didn't know | Alone | How many people did you live with in that period? | Bored.3c | ... | Taking public transportation.1 | Going to a crowded event outdoors.1 | Do you feel you have gone back to your “normality”? | What is your job sector? | What is your major? | What is your gender? | How old are you now? | Where did you grow up? | What is your ethnicity? | Which of these emotions do you associate with COVID-19? | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | New York | Study | Family | NaN | NaN | NaN | NaN | NaN | 3.0 | NaN | ... | 3.0 | 2.0 | 4.0 | NaN | Engineering and Technology | Male | 24-30 | I come from abroad | Asian | Stressed |
1 | New York | Both | Family | NaN | NaN | NaN | NaN | NaN | 2.0 | Bored | ... | 4.0 | 4.0 | 4.0 | NaN | Engineering and Technology | Female | 24-30 | I'm from somewhere else in the US | Asian | Frustrated |
2 | New York | Work | NaN | NaN | NaN | Roommates | NaN | NaN | 2.0 | NaN | ... | 5.0 | 5.0 | 8.0 | Information Technology | NaN | Male | 30-40 | I come from abroad | Asian | Distant |
3 | New York | Work | NaN | NaN | Pets | NaN | NaN | NaN | NaN | Bored | ... | 4.0 | 3.0 | 5.0 | Food & Consumer Goods | NaN | Male | 40-50 | I'm from New York State | White | Frustrated |
4 | New York | Work | NaN | NaN | NaN | NaN | NaN | Alone | NaN | NaN | ... | 5.0 | 5.0 | 6.0 | Food & Consumer Goods | NaN | Male | 30-40 | I'm from New York State | White | Optimistic |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
3000 | Milan | Study | Family | NaN | NaN | NaN | NaN | NaN | 1.0 | NaN | ... | 2.0 | 4.0 | 5.0 | NaN | Built Environment and Design | Female | 24-30 | I'm Italian | White | NaN |
3001 | Milan | Study | Family | NaN | NaN | NaN | NaN | NaN | 2.0 | NaN | ... | 5.0 | 5.0 | 9.0 | NaN | Commerce, Management, Tourism and Services | Male | 20-24 | I'm Italian | White | NaN |
3002 | Milan | Work | Family | NaN | NaN | NaN | NaN | NaN | 3.0 | NaN | ... | 1.0 | 5.0 | 4.0 | Other / I prefer not to answer | NaN | Female | 50-60 | I'm Italian | White | NaN |
3003 | Milan | Work | Family | NaN | NaN | NaN | NaN | NaN | 3.0 | NaN | ... | 3.0 | 5.0 | 9.0 | Information Technology | NaN | Male | 50-60 | I'm Italian | White | NaN |
3004 | Milan | Study | Family | NaN | NaN | NaN | NaN | NaN | 3.0 | NaN | ... | 3.0 | 5.0 | 7.0 | NaN | Studies in Human Society | Male | 18-20 | I come from abroad | White | NaN |
3005 rows × 123 columns
warnings.simplefilter(action='ignore', category=FutureWarning)
Checking Information of the COVID Data Frame¶
covid.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3005 entries, 0 to 3004 Columns: 123 entries, City to Which of these emotions do you associate with COVID-19? dtypes: float64(23), object(100) memory usage: 2.8+ MB
Renaming Columns
covid.rename({'At the beginning of the pandemic did you work or study?': 'Work_Study_Status','Family':'Live_with_Family',
'Friends':'Live_with_Friends', 'Pets':'Live_with_Pets', 'Roommates':'Live_with_Roommates',
"People I didn't know":'Live_with_Acquaintance', 'Alone':"Live_Alone"}, axis=1, inplace=True)
covid.rename({'Bored.3c':'Feel_During_Bored', 'Stressed.3c':'Feel_During_Stressed', 'Free.3c':'Feel_During_Free',
'Optimistic.3c':'Feel_During_Optimistic', 'Peaceful.3c':'Feel_During_Peaceful','Lonely.3c':'Feel_During_Lonely',
'Disappointed.3c':'Feel_During_Disappointed', 'Frustrated.3c':'Feel_During_Frustrated'}, axis=1, inplace=True)
covid.rename({ 'Threatened.3c':'Feel_During_Threatened','Bored.3e':'Talk_Bored', 'Stressed.3e':'Talk_Stressed',
'Excited.3e':'Talk_Excited','Hopeful.3e':'Talk_Hopeful', 'Peaceful.3e':'Talk_Peaceful', 'Lonely.3e':'Talk_Lonely',
'Vulnerable.3e':'Talk_Vulnerable','Disappointed.3e':'Talk_Disappointed', 'Anxious.3e':'Talk_Anxious'}, axis=1, inplace=True)
covid.rename({'Mask mandate.2020':'mask20','Lockdown.2020':'lock20','Closing Workplace/University.2020':'close20',
'Vaccination pass.2020':'vaccine20','Social distancing.2020':'distance20','Quarantine.2020':'quarent20',
'Mask mandate.2021':'mask21','Lockdown.2021':'lock21','Closing Workplace/University.2021':'close21',
'Vaccination pass.2021':'vaccine21','Social distancing.2021':'distance21','Quarantine.2021':'quarent21'
}, axis=1,inplace=True)
covid.rename({'Proud.4c':'Feel_novacc_Proud','Optimistic.4c':'Feel_novacc_Optimistic',
'Indifferent.4c':'Feel_novacc_Indifferent','Vulnerable.4c':'Feel_novacc_Vulnerable',
'Disappointed.4c':'Feel_novacc_Disappointed','Hostile.4c':'Feel_novacc_Hostile',
'Threatened.4c':'Feel_novacc_Threatened','Anxious.4c':'Feel_novacc_Anxious','Free.4c':'Feel_novacc_Free'
}, axis=1,inplace=True)
covid.rename({'Yes, in 2020':'positive20','Yes, in 2021':'positive21','Yes, in 2022':'positive22'},axis=1,inplace=True)
covid.rename({'Stressed.5b':'Feel_sick_Stressed','Optimistic.5b':'Feel_sick_Optimistic',
'Peaceful.5b':'Feel_sick_Peace','Lonely.5b':'Feel_sick_Lonely',
'Vulnerable.5b':'Feel_sick_Vulnerable','Disappointed.5b':'Feel_sick_Disappointed',
'Frustrated.5b':'Feel_sick_Frustrated','Anxious.5b':'Feel_sick_Anxious',
'Threatened.5b':'Feel_sick_Threatened'
}, axis=1,inplace=True)
covid.rename({'Bored.6b':'Feel_study_Bored','Stressed.6b':'Feel_study_Stressed','Free.6b':'Feel_study_Free',
'Proud.6b':'Feel_study_Proud','Optimistic.6b':'Feel_study_Optimistic','Lonely.6b':'Feel_study_Lonely',
'Disappointed.6b':'Feel_study_Disappointed','Frustrated.6b':'Feel_study_Frustrated',
'Anxious.6b':'Feel_study_Anxious'
}, axis=1,inplace=True)
covid.rename({'Free.7c': 'Feel_profess_Free' ,'Aggressive.7c': 'Feel_profess_Aggressive' ,
'Frustrated.7c': 'Feel_profess_Frustrated' ,'Disappointed.7c': 'Feel_profess_Disappointed' ,
'Vulnerable.7c': 'Feel_profess_Vulnerable' ,'Lonely.7c': 'Feel_profess_Lonely' ,
'Optimistic.7c': 'Feel_profess_Optimistic' ,'Enthusiastic.7c': 'Feel_profess_Enthusiastic' ,
'Stressed.7c': 'Feel_profess_Stressed' ,
}, axis=1,inplace=True)
covid.rename({'Did your professional situation change during COVID-19?': 'prof_situation_change' ,
'Meeting a small group of friends (2-6 people)': 'Comfort_group_21' ,
'Going to public places (cinemas, theaters, restaurants, gym)': 'Comfort_go_public_21' ,
'Taking public transportation' : 'Comfort_publictrans_21' ,
'Going to a crowded event outdoors' : 'Comfort_event_outdoor_21' ,
'Meeting a small group of friends (2-6 people).1' : 'Comfort_group_22' ,
'Going to public places (cinemas, theaters, restaurants, gym).1': 'Comfort_go_public_22',
'Taking public transportation.1': 'Comfort_publictrans_22' ,
'Going to a crowded event outdoors.1': 'Comfort_event_outdoor_22' ,
'Do you feel you have gone back to your “normality”?': 'Feel_normal'
}, axis=1,inplace=True)
covid.rename({'What is your job sector?': 'Job_sector' , 'What is your major?': 'School_major' ,'What is your gender?': 'Gender',
'How old are you now?': 'Age' , 'What is your ethnicity?': 'Ethnicity',
'Which of these emotions do you associate with COVID-19?': 'Emotion'
}, axis=1,inplace=True)
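The rename cells above all follow one pattern: a survey suffix such as `.3c` maps to a readable prefix such as `Feel_During_`. The mapping could also be generated programmatically; a sketch on a toy frame (the suffix table below covers only a few suffixes as an example):

```python
import pandas as pd

# Illustrative suffix-to-prefix table (extend with the remaining survey suffixes)
SUFFIX_PREFIX = {".3c": "Feel_During_", ".3e": "Talk_", ".5b": "Feel_sick_"}

def rename_by_suffix(df: pd.DataFrame) -> pd.DataFrame:
    mapping = {}
    for col in df.columns:
        for suffix, prefix in SUFFIX_PREFIX.items():
            if col.endswith(suffix):
                # e.g. 'Bored.3c' -> 'Feel_During_Bored'
                mapping[col] = prefix + col[: -len(suffix)]
    return df.rename(columns=mapping)

toy = pd.DataFrame(columns=["Bored.3c", "Anxious.5b", "City"])
print(rename_by_suffix(toy).columns.tolist())
# ['Feel_During_Bored', 'Feel_sick_Anxious', 'City']
```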
Checking Some Columns After Renaming¶
print(covid.columns[2:29])
print(covid.columns[41:93])
print(covid.columns[96:120])
Index(['Live_with_Family', 'Live_with_Friends', 'Live_with_Pets', 'Live_with_Roommates', 'Live_with_Acquaintance', 'Live_Alone', 'How many people did you live with in that period?', 'Feel_During_Bored', 'Feel_During_Stressed', 'Feel_During_Free', 'Feel_During_Optimistic', 'Feel_During_Peaceful', 'Feel_During_Lonely', 'Feel_During_Disappointed', 'Feel_During_Frustrated', 'Feel_During_Threatened', 'Before covid', 'During covid', 'Talk_Bored', 'Talk_Stressed', 'Talk_Excited', 'Talk_Hopeful', 'Talk_Peaceful', 'Talk_Lonely', 'Talk_Vulnerable', 'Talk_Disappointed', 'Talk_Anxious'], dtype='object') Index(['Feel_novacc_Proud', 'Feel_novacc_Optimistic', 'Feel_novacc_Indifferent', 'Feel_novacc_Vulnerable', 'Feel_novacc_Disappointed', 'Feel_novacc_Hostile', 'Feel_novacc_Threatened', 'Feel_novacc_Anxious', 'Feel_novacc_Free', 'No', 'positive20', 'positive21', 'positive22', 'Feel_sick_Stressed', 'Feel_sick_Optimistic', 'Feel_sick_Peace', 'Feel_sick_Lonely', 'Feel_sick_Vulnerable', 'Feel_sick_Disappointed', 'Feel_sick_Frustrated', 'Feel_sick_Anxious', 'Feel_sick_Threatened', 'What were the priorities in your life *before* COVID-19?.1', 'What were the priorities in your life *before* COVID-19?.2', 'What were the priorities in your life *before* COVID-19?.3', 'What were the priorities in your life *before* COVID-19?.4', 'What were the priorities in your life *before* COVID-19?.5', 'What were the priorities in your life *before* COVID-19?.6', 'What were the priorities in your life *during* COVID-19?.1', 'What were the priorities in your life *during* COVID-19?.2', 'What were the priorities in your life *during* COVID-19?.3', 'What were the priorities in your life *during* COVID-19?.4', 'What were the priorities in your life *during* COVID-19?.5', 'What were the priorities in your life *during* COVID-19?.6', 'What is your main source of income?', 'Feel_study_Bored', 'Feel_study_Stressed', 'Feel_study_Free', 'Feel_study_Proud', 'Feel_study_Optimistic', 'Feel_study_Lonely', 
'Feel_study_Disappointed', 'Feel_study_Frustrated', 'Feel_study_Anxious', 'Did your studying situation changed during COVID-19?', 'Change in personal interests', 'Change in learning modes', 'Family / Friends / Advisors Influence', 'Low grades', 'Better Career Prospects', 'Economic Difficulties', 'Started working'], dtype='object') Index(['Feel_profess_Free', 'Feel_profess_Aggressive', 'Feel_profess_Frustrated', 'Feel_profess_Disappointed', 'Feel_profess_Vulnerable', 'Feel_profess_Lonely', 'Feel_profess_Optimistic', 'Feel_profess_Enthusiastic', 'Feel_profess_Stressed', 'prof_situation_change', 'Why did your professional situation change?', 'Comfort_group_21', 'Comfort_go_public_21', 'Comfort_publictrans_21', 'Comfort_event_outdoor_21', 'Comfort_group_22', 'Comfort_go_public_22', 'Comfort_publictrans_22', 'Comfort_event_outdoor_22', 'Feel_normal', 'Job_sector', 'School_major', 'Gender', 'Age'], dtype='object')
Checking missing values¶
def count_nan_in_columns(df, start_col, end_col):
    num_nan = [df.iloc[:, i].isna().sum() for i in range(start_col, end_col)]
    return num_nan
num_nan_list1 = count_nan_in_columns(covid, 2, 28)
print(num_nan_list1)
num_nan_list2 = count_nan_in_columns(covid, 41, 93)
print(num_nan_list2)
num_nan_list3 = count_nan_in_columns(covid, 96, 104)
print(num_nan_list3)
[np.int64(417), np.int64(2729), np.int64(2607), np.int64(2764), np.int64(2976), np.int64(2799), np.int64(611), np.int64(1966), np.int64(2268), np.int64(2658), np.int64(2507), np.int64(1317), np.int64(2619), np.int64(2641), np.int64(2450), np.int64(2843), np.int64(0), np.int64(0), np.int64(2412), np.int64(2323), np.int64(2505), np.int64(2457), np.int64(2147), np.int64(2053), np.int64(2431), np.int64(2404)] [np.int64(2900), np.int64(2880), np.int64(2402), np.int64(2715), np.int64(943), np.int64(2034), np.int64(2131), np.int64(2623), np.int64(2791), np.int64(1431), np.int64(2627), np.int64(2555), np.int64(2180), np.int64(2618), np.int64(2813), np.int64(2451), np.int64(2576), np.int64(2734), np.int64(2880), np.int64(2531), np.int64(2764), np.int64(2810), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(414), np.int64(2042), np.int64(1739), np.int64(2663), np.int64(2867), np.int64(2635), np.int64(2053), np.int64(2530), np.int64(1990), np.int64(2278), np.int64(361), np.int64(2913), np.int64(2912), np.int64(2961), np.int64(2971), np.int64(2916), np.int64(2964), np.int64(2973)] [np.int64(2945), np.int64(2947), np.int64(2919), np.int64(2915), np.int64(2895), np.int64(2941), np.int64(2930), np.int64(2948)]
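The helper above works, but the same counts come out of a single vectorized call, since `isna().sum()` operates column-wise; a sketch on a toy frame:

```python
import numpy as np
import pandas as pd

toy = pd.DataFrame({"a": [1.0, np.nan, np.nan],
                    "b": ["x", None, "y"],
                    "c": [1, 2, 3]})
# Count missing values for the first two columns in one pass
nan_counts = toy.iloc[:, 0:2].isna().sum()
print(nan_counts.tolist())  # [2, 1]
```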
Cleaning Columns for Missing Values
def replace_nan_with_yes_no(df, start_col=None, end_col=None):
    if start_col is None:
        start_col = 0  # Default to the first column
    if end_col is None:
        end_col = start_col  # If end_col isn't provided, modify only start_col
    for col in range(start_col, end_col + 1):
        col_name = df.columns[col]
        # Any non-missing answer means the option was selected
        df[col_name] = df[col_name].where(df[col_name].isna(), 'Yes')  # non-NaN -> 'Yes'
        df[col_name] = df[col_name].fillna('No')                       # NaN -> 'No'
replace_nan_with_yes_no(covid, start_col=2, end_col=28)
replace_nan_with_yes_no(covid, start_col=41, end_col=93)
replace_nan_with_yes_no(covid, start_col=96 , end_col = 104)
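The same Yes/No recoding can be done without the two-step `where`, since `notna()` already gives the boolean mask to map; a sketch on a toy frame (column names chosen to mirror the survey's checkbox columns):

```python
import numpy as np
import pandas as pd

toy = pd.DataFrame({"Live_with_Pets": ["Pets", np.nan, "Pets"],
                    "Live_Alone": [np.nan, "Alone", np.nan]})
for col in ["Live_with_Pets", "Live_Alone"]:
    # Any non-missing answer means the checkbox was ticked
    toy[col] = toy[col].notna().map({True: "Yes", False: "No"})
print(toy["Live_with_Pets"].tolist())  # ['Yes', 'No', 'Yes']
```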
Counting the Number of "No" Responses in Each Column¶
This is a sanity check: the number of "No" values in each column should match its original number of missing values.
def count_no_in_columns(df, start_col, end_col):
    num_no = [(df.iloc[:, i] == "No").sum() for i in range(start_col, end_col)]
    return num_no
num_no_list1 = count_no_in_columns(covid, 2, 28)
print(num_no_list1)
num_no_list2 = count_no_in_columns(covid, 41, 93)
print(num_no_list2)
num_no_list3 = count_no_in_columns(covid, 96, 104)
print(num_no_list3)
[np.int64(417), np.int64(2729), np.int64(2607), np.int64(2764), np.int64(2976), np.int64(2799), np.int64(611), np.int64(1966), np.int64(2268), np.int64(2658), np.int64(2507), np.int64(1317), np.int64(2619), np.int64(2641), np.int64(2450), np.int64(2843), np.int64(0), np.int64(0), np.int64(2412), np.int64(2323), np.int64(2505), np.int64(2457), np.int64(2147), np.int64(2053), np.int64(2431), np.int64(2404)] [np.int64(2900), np.int64(2880), np.int64(2402), np.int64(2715), np.int64(943), np.int64(2034), np.int64(2131), np.int64(2623), np.int64(2791), np.int64(1431), np.int64(2627), np.int64(2555), np.int64(2180), np.int64(2618), np.int64(2813), np.int64(2451), np.int64(2576), np.int64(2734), np.int64(2880), np.int64(2531), np.int64(2764), np.int64(2810), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(0), np.int64(414), np.int64(2042), np.int64(1739), np.int64(2663), np.int64(2867), np.int64(2635), np.int64(2053), np.int64(2530), np.int64(1990), np.int64(2278), np.int64(361), np.int64(2913), np.int64(2912), np.int64(2961), np.int64(2971), np.int64(2916), np.int64(2964), np.int64(2973)] [np.int64(2945), np.int64(2947), np.int64(2919), np.int64(2915), np.int64(2895), np.int64(2941), np.int64(2930), np.int64(2948)]
def compare_counts(num_no_list, num_nan_list):
    for i in range(len(num_no_list)):
        if num_no_list[i] != num_nan_list[i]:
            print("Mismatch at index", i)
compare_counts(num_no_list1, num_nan_list1)
compare_counts(num_no_list2, num_nan_list2)
compare_counts(num_no_list3, num_nan_list3)
Removing Columns
These columns are excluded because they are not relevant to the analysis specified in the proposal.
covid.drop(['How many people did you live with in that period?',
'Before covid', 'During covid',
'No','What were the priorities in your life *before* COVID-19?.1',
'What were the priorities in your life *before* COVID-19?.2',
'What were the priorities in your life *before* COVID-19?.3',
'What were the priorities in your life *before* COVID-19?.4',
'What were the priorities in your life *before* COVID-19?.5',
'What were the priorities in your life *before* COVID-19?.6',
'What were the priorities in your life *during* COVID-19?.1',
'What were the priorities in your life *during* COVID-19?.2',
'What were the priorities in your life *during* COVID-19?.3',
'What were the priorities in your life *during* COVID-19?.4',
'What were the priorities in your life *during* COVID-19?.5',
'What were the priorities in your life *during* COVID-19?.6',
'What is your main source of income?',
'Did your studying situation changed during COVID-19?',
'Change in personal interests','Change in learning modes',
'Family / Friends / Advisors Influence','Low grades',
'Better Career Prospects','Economic Difficulties','Started working',
'Other / I prefer not to answer','How would you describe your job?',
'In your daily job, do you work directly with the public?',
'Why did your professional situation change?','Where did you grow up?'
], axis=1, inplace=True)
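With a hand-written drop list this long, it can help to validate the labels first so a renamed or missing column is caught deliberately rather than raising mid-pipeline; a sketch on a toy frame:

```python
import pandas as pd

toy = pd.DataFrame(columns=["keep_me", "Before covid", "During covid"])
to_drop = ["Before covid", "During covid", "No longer present"]

# Report labels that are not in the frame before dropping
missing = [c for c in to_drop if c not in toy.columns]
print(missing)  # ['No longer present']

# errors='ignore' drops the labels it finds and skips the rest
toy = toy.drop(columns=to_drop, errors="ignore")
print(toy.columns.tolist())  # ['keep_me']
```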
covidclean = covid.copy()  # work on a copy of the cleaned frame
covidclean
City | Work_Study_Status | Live_with_Family | Live_with_Friends | Live_with_Pets | Live_with_Roommates | Live_with_Acquaintance | Live_Alone | Feel_During_Bored | Feel_During_Stressed | ... | Comfort_go_public_22 | Comfort_publictrans_22 | Comfort_event_outdoor_22 | Feel_normal | Job_sector | School_major | Gender | Age | Ethnicity | Emotion | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | New York | Study | Yes | No | No | No | No | No | No | No | ... | 4.0 | 3.0 | 2.0 | 4.0 | NaN | Engineering and Technology | Male | 24-30 | Asian | Stressed |
1 | New York | Both | Yes | No | No | No | No | No | Yes | No | ... | 4.0 | 4.0 | 4.0 | 4.0 | NaN | Engineering and Technology | Female | 24-30 | Asian | Frustrated |
2 | New York | Work | No | No | No | Yes | No | No | No | Yes | ... | 5.0 | 5.0 | 5.0 | 8.0 | Information Technology | NaN | Male | 30-40 | Asian | Distant |
3 | New York | Work | No | No | Yes | No | No | No | Yes | No | ... | 3.0 | 4.0 | 3.0 | 5.0 | Food & Consumer Goods | NaN | Male | 40-50 | White | Frustrated |
4 | New York | Work | No | No | No | No | No | Yes | No | No | ... | 5.0 | 5.0 | 5.0 | 6.0 | Food & Consumer Goods | NaN | Male | 30-40 | White | Optimistic |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
3000 | Milan | Study | Yes | No | No | No | No | No | No | No | ... | 4.0 | 2.0 | 4.0 | 5.0 | NaN | Built Environment and Design | Female | 24-30 | White | NaN |
3001 | Milan | Study | Yes | No | No | No | No | No | No | No | ... | 5.0 | 5.0 | 5.0 | 9.0 | NaN | Commerce, Management, Tourism and Services | Male | 20-24 | White | NaN |
3002 | Milan | Work | Yes | No | No | No | No | No | No | No | ... | 4.0 | 1.0 | 5.0 | 4.0 | Other / I prefer not to answer | NaN | Female | 50-60 | White | NaN |
3003 | Milan | Work | Yes | No | No | No | No | No | No | No | ... | 4.0 | 3.0 | 5.0 | 9.0 | Information Technology | NaN | Male | 50-60 | White | NaN |
3004 | Milan | Study | Yes | No | No | No | No | No | No | No | ... | 3.0 | 3.0 | 5.0 | 7.0 | NaN | Studies in Human Society | Male | 18-20 | White | NaN |
3005 rows × 93 columns
covidclean.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3005 entries, 0 to 3004 Data columns (total 93 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 City 3005 non-null object 1 Work_Study_Status 3005 non-null object 2 Live_with_Family 3005 non-null object 3 Live_with_Friends 3005 non-null object 4 Live_with_Pets 3005 non-null object 5 Live_with_Roommates 3005 non-null object 6 Live_with_Acquaintance 3005 non-null object 7 Live_Alone 3005 non-null object 8 Feel_During_Bored 3005 non-null object 9 Feel_During_Stressed 3005 non-null object 10 Feel_During_Free 3005 non-null object 11 Feel_During_Optimistic 3005 non-null object 12 Feel_During_Peaceful 3005 non-null object 13 Feel_During_Lonely 3005 non-null object 14 Feel_During_Disappointed 3005 non-null object 15 Feel_During_Frustrated 3005 non-null object 16 Feel_During_Threatened 3005 non-null object 17 Talk_Bored 3005 non-null object 18 Talk_Stressed 3005 non-null object 19 Talk_Excited 3005 non-null object 20 Talk_Hopeful 3005 non-null object 21 Talk_Peaceful 3005 non-null object 22 Talk_Lonely 3005 non-null object 23 Talk_Vulnerable 3005 non-null object 24 Talk_Disappointed 3005 non-null object 25 Talk_Anxious 3005 non-null object 26 mask20 2989 non-null float64 27 lock20 2991 non-null float64 28 close20 2990 non-null float64 29 vaccine20 2959 non-null float64 30 distance20 2947 non-null float64 31 quarent20 2944 non-null float64 32 mask21 2986 non-null float64 33 lock21 2989 non-null float64 34 close21 2980 non-null float64 35 vaccine21 2960 non-null float64 36 distance21 2959 non-null float64 37 quarent21 2955 non-null float64 38 Feel_novacc_Proud 3005 non-null object 39 Feel_novacc_Optimistic 3005 non-null object 40 Feel_novacc_Indifferent 3005 non-null object 41 Feel_novacc_Vulnerable 3005 non-null object 42 Feel_novacc_Disappointed 3005 non-null object 43 Feel_novacc_Hostile 3005 non-null object 44 Feel_novacc_Threatened 3005 non-null object 45 Feel_novacc_Anxious 3005 non-null object 
46 Feel_novacc_Free 3005 non-null object 47 positive20 3005 non-null object 48 positive21 3005 non-null object 49 positive22 3005 non-null object 50 Feel_sick_Stressed 3005 non-null object 51 Feel_sick_Optimistic 3005 non-null object 52 Feel_sick_Peace 3005 non-null object 53 Feel_sick_Lonely 3005 non-null object 54 Feel_sick_Vulnerable 3005 non-null object 55 Feel_sick_Disappointed 3005 non-null object 56 Feel_sick_Frustrated 3005 non-null object 57 Feel_sick_Anxious 3005 non-null object 58 Feel_sick_Threatened 3005 non-null object 59 Feel_study_Bored 3005 non-null object 60 Feel_study_Stressed 3005 non-null object 61 Feel_study_Free 3005 non-null object 62 Feel_study_Proud 3005 non-null object 63 Feel_study_Optimistic 3005 non-null object 64 Feel_study_Lonely 3005 non-null object 65 Feel_study_Disappointed 3005 non-null object 66 Feel_study_Frustrated 3005 non-null object 67 Feel_study_Anxious 3005 non-null object 68 Feel_profess_Free 3005 non-null object 69 Feel_profess_Aggressive 3005 non-null object 70 Feel_profess_Frustrated 3005 non-null object 71 Feel_profess_Disappointed 3005 non-null object 72 Feel_profess_Vulnerable 3005 non-null object 73 Feel_profess_Lonely 3005 non-null object 74 Feel_profess_Optimistic 3005 non-null object 75 Feel_profess_Enthusiastic 3005 non-null object 76 Feel_profess_Stressed 3005 non-null object 77 prof_situation_change 337 non-null object 78 Comfort_group_21 3004 non-null float64 79 Comfort_go_public_21 3004 non-null float64 80 Comfort_publictrans_21 3004 non-null float64 81 Comfort_event_outdoor_21 3004 non-null float64 82 Comfort_group_22 3004 non-null float64 83 Comfort_go_public_22 3004 non-null float64 84 Comfort_publictrans_22 3004 non-null float64 85 Comfort_event_outdoor_22 3004 non-null float64 86 Feel_normal 3004 non-null float64 87 Job_sector 360 non-null object 88 School_major 2644 non-null object 89 Gender 3004 non-null object 90 Age 3004 non-null object 91 Ethnicity 3004 non-null object 92 Emotion 603 non-null 
object dtypes: float64(21), object(72) memory usage: 2.1+ MB
Perceptions Related to Living with Others Versus Living Alone During the Pandemic¶
living = ['Live_with_Family', 'Live_with_Friends', 'Live_with_Pets', 'Live_with_Roommates',
'Live_with_Acquaintance']
feelDuring = ['Feel_During_Bored', 'Feel_During_Stressed', 'Feel_During_Free',
'Feel_During_Optimistic', 'Feel_During_Peaceful', 'Feel_During_Lonely',
'Feel_During_Disappointed', 'Feel_During_Frustrated',
'Feel_During_Threatened']
living_Alone = ['Live_Alone']
feel_talking = ['Talk_Bored', 'Talk_Stressed', 'Talk_Excited', 'Talk_Hopeful', 'Talk_Peaceful',
'Talk_Lonely', 'Talk_Vulnerable', 'Talk_Disappointed', 'Talk_Anxious']
response = {}
# Tally, for each (living situation, feeling) pair, the rows answering "Yes" to both
for i in range(len(covidclean)):
    for living_condition in living:
        for feeling_condition in feelDuring:
            key = f"{living_condition}_{feeling_condition}"
            if covidclean[living_condition][i] == "Yes" and covidclean[feeling_condition][i] == "Yes":
                response[key] = response.get(key, 0) + 1
data = []
for key, count in response.items():
    parts = key.split('_')
    # The last part is the feeling; the remaining parts form the living-situation column name
    feeling = parts[-1]
    living_situation = '_'.join(parts[:-1])
    data.append({'Living Situation': living_situation, 'Feeling': feeling, 'Count': count})
df = pd.DataFrame(data)
pivot_table = df.pivot(index='Living Situation', columns='Feeling', values='Count').fillna(0)
stacked_data = {living_condition: [response.get(f"{living_condition}_{feeling}", 0) for feeling in feelDuring] for living_condition in living}
stacked_values = np.array(list(stacked_data.values()))
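For reference, the nested-loop tally above is equivalent to a boolean matrix product: turning each column group into a 0/1 indicator matrix, `L.T @ F` yields every living-situation × feeling co-occurrence count at once. A sketch on a toy frame (two columns per group for brevity):

```python
import pandas as pd

toy = pd.DataFrame({
    "Live_with_Family":   ["Yes", "Yes", "No"],
    "Live_with_Pets":     ["No",  "Yes", "No"],
    "Feel_During_Bored":  ["Yes", "No",  "Yes"],
    "Feel_During_Lonely": ["Yes", "Yes", "No"],
})
L = (toy[["Live_with_Family", "Live_with_Pets"]] == "Yes").astype(int)
F = (toy[["Feel_During_Bored", "Feel_During_Lonely"]] == "Yes").astype(int)
co_counts = L.T @ F  # rows: living situations, columns: feelings
print(co_counts)
```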
response_alone = {}
for i in range(len(covidclean)):
    for living_condition in living_Alone:
        for talking_feeling_condition in feel_talking:
            key = f"{living_condition}_{talking_feeling_condition}"
            if covidclean[living_condition][i] == "Yes" and covidclean[talking_feeling_condition][i] == "Yes":
                response_alone[key] = response_alone.get(key, 0) + 1
response_flipped = {}
# Total count for each feeling, summed across all living situations
for feeling_condition in feelDuring:
    response_flipped[feeling_condition] = sum(response.get(f"{living_condition}_{feeling_condition}", 0)
                                              for living_condition in living)
feelings =['Bored', 'Stressed', 'Peaceful','Lonely', 'Disappointed']
feel_talking_counts = [response_alone.get(f'Live_Alone_{feeling}', 0) for feeling in feel_talking]
values_alone = [response_alone.get(f'Live_Alone_Talk_{feeling}', 0) for feeling in feelings]
values_living = [sum(response.get(f'{condition}_Feel_During_{feeling}', 0) for condition in living)
for feeling in feelings]
max_value_alone = max(values_alone)
max_value_living = max(values_living)
scale_factor = max_value_alone / max_value_living if max_value_living != 0 else 1
scaled_values_living = [round(value * scale_factor) for value in values_living]
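The scale factor simply rescales the combined-living counts so both series peak at the same height. An alternative that avoids a unitless axis is to compare shares within each group; a sketch with illustrative numbers:

```python
# Illustrative counts for three feelings in two groups
values_alone = [30, 12, 9]
values_living = [900, 480, 270]

# Convert each group to within-group shares, which are directly comparable
shares_alone = [v / sum(values_alone) for v in values_alone]
shares_living = [v / sum(values_living) for v in values_living]
print([round(s, 2) for s in shares_alone])  # [0.59, 0.24, 0.18]
```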
x1 = np.arange(len(feel_talking))
width1 = 0.35 # the width of the bars
x = np.arange(len(feelings))
width = 0.35
fig, ax = plt.subplots(2, 2, figsize=(20, 15))
pivot_table = df.pivot(index='Living Situation', columns='Feeling', values='Count')
# Create the heatmap
sns.heatmap(data=pivot_table, ax=ax[0, 0], annot=True, fmt="d", cmap='YlOrRd')
ax[0,0].set_xticklabels(['Bored','Disappointed', 'Free','Frustrated','Lonely',
'Optimistic', 'Peaceful','Stressed',
'Threatened'], rotation=45)
ax[0,0].set_yticklabels(['Living with Acquaintance', 'Living with Family', 'Living with Friends',
'Living with Pets', 'Living with Roommates'])
ax[0,0].set_title('Living Situations and Feelings')
ax[0,0].set_xlabel("", rotation=45)
#plot 2
bottom_values = np.zeros(len(feelDuring))
for living_condition, values in stacked_data.items():
    ax[0, 1].bar(feelDuring, values, bottom=bottom_values, label=living_condition)
    bottom_values += values
ax[0, 1].set_ylabel('Counts')
ax[0, 1].set_title('Living Situations and Feelings')
ax[0, 1].set_xticks(range(len(feelDuring)))
ax[0, 1].set_xticklabels(['Bored', 'Stressed', 'Free', 'Optimistic', 'Peaceful',
'Lonely', 'Disappointed', 'Frustrated', 'Threatened'], rotation=45)
# Adjusting legend labels
legend = ax[0, 1].legend()
legend.texts[0].set_text('Living with Family')
legend.texts[1].set_text('Living with Friends')
legend.texts[2].set_text('Living with Pets')
legend.texts[3].set_text('Living with Roommates')
legend.texts[4].set_text('Living with Acquaintance')
# Plot 3
rects = ax[1,0].bar(x1, feel_talking_counts, width1)
ax[1,0].set_ylabel('Counts')
ax[1,0].set_title('Feelings when Talking to Friends and Family While Living Alone')
ax[1,0].set_xticks(x1)
# Labels must follow the order of feel_talking
ax[1,0].set_xticklabels(['Bored', 'Stressed', 'Excited', 'Hopeful', 'Peaceful', 'Lonely',
                         'Vulnerable', 'Disappointed', 'Anxious'], rotation=45)
for rect in rects:
    height = rect.get_height()
    ax[1,0].annotate(f'{height}',
                     xy=(rect.get_x() + rect.get_width() / 2, height),
                     xytext=(0, 3),
                     textcoords="offset points",
                     ha='center', va='bottom')
#plot 4
rects1 = ax[1,1].bar(x - width/2, values_alone, width, label='Living Alone')
rects2 = ax[1,1].bar(x + width/2, scaled_values_living, width, label='Combined Living Situations')
ax[1,1].set_title("Different Living Situations: A Comparison of Feelings")
ax[1,1].set_xticks(x)
ax[1,1].set_xticklabels(feelings, rotation=45)
ax[1,1].set_yticks([])
legend = ax[1,1].legend()
legend.texts[0].set_text('Living Alone')
legend.texts[1].set_text('Living with People Scaled')
plt.tight_layout()
plt.show()
fig, ax = plt.subplots(figsize=(18, 10))
sns.set(font_scale=1.2)
pivot_table = df.pivot(index='Living Situation', columns='Feeling', values='Count')
sns.heatmap(data=pivot_table, annot=True, fmt="d", cmap='YlOrRd')
ax.set_xticklabels(['Bored','Disappointed', 'Free','Frustrated','Lonely',
'Optimistic', 'Peaceful','Stressed',
'Threatened'], rotation=45,size=15)
ax.set_yticklabels(['Living with Acquaintance', 'Living with Family', 'Living with Friends',
'Living with Pets', 'Living with Roommates'],size=15)
ax.set_title('Living Situations and Feelings')
ax.title.set_size(20)
ax.set_xlabel("", rotation=45)
plt.tight_layout()
plt.show()
#plot 2
feelDuring = ['Feel_During_Bored', 'Feel_During_Stressed', 'Feel_During_Free',
'Feel_During_Optimistic', 'Feel_During_Peaceful', 'Feel_During_Lonely',
'Feel_During_Disappointed', 'Feel_During_Frustrated',
'Feel_During_Threatened']
fig, ax = plt.subplots(figsize=(18, 10))
sns.set(font_scale=1.2)
bottom_values = np.zeros(len(feelDuring))
for living_condition, values in stacked_data.items():
    ax.bar(feelDuring, values, bottom=bottom_values, label=living_condition)
    bottom_values += values
ax.set_ylabel('Counts',size=15)
ax.set_title('Living Situations and Feelings')
ax.set_xticks(range(len(feelDuring)))
ax.set_xticklabels(['Bored', 'Stressed', 'Free', 'Optimistic', 'Peaceful',
'Lonely', 'Disappointed', 'Frustrated', 'Threatened'], rotation=45,size=15)
plt.yticks(fontsize=15)
# Adjusting legend labels
legend = ax.legend()
legend.texts[0].set_text('Living with Family')
legend.texts[1].set_text('Living with Friends')
legend.texts[2].set_text('Living with Pets')
legend.texts[3].set_text('Living with Roommates')
legend.texts[4].set_text('Living with Acquaintance')
plt.tight_layout()
plt.show()
# Plot 3
fig, ax = plt.subplots(figsize=(18, 10))
sns.set(font_scale=1.2)
rects = ax.bar(x1, feel_talking_counts, width1)
ax.set_ylabel('Counts')
ax.set_title('Feelings when Talking to Friends and Family While Living Alone')
ax.set_xticks(x1)
ax.set_xticklabels(['Lonely','Bored','Peaceful','Stressed','Anxious','Vulnerable'
,'Disappointed','Hopeful','Excited'], rotation=45)
for rect in rects:
    height = rect.get_height()
    ax.annotate('{}'.format(height),
                xy=(rect.get_x() + rect.get_width() / 2, height),
                xytext=(0, 3),
                textcoords="offset points",
                ha='center', va='bottom')
print(pivot_table)
result = list(zip(feelDuring, bottom_values))
print(result)
Feeling                             Bored  Disappointed  Free  Frustrated  Lonely  Optimistic  Peaceful  Stressed  Threatened
Living Situation
Live_with_Acquaintance_Feel_During     10             5     7           8       9           4         7         7          10
Live_with_Family_Feel_During          958           315   267         494     330         422      1555       676         112
Live_with_Friends_Feel_During          88            35    69          44      37          65       156        43          17
Live_with_Pets_Feel_During            135            55    65          82      55          86       210       102          37
Live_with_Roommates_Feel_During        67            33    59          41      36          59       115        60          29

[('Feel_During_Bored', 1258.0), ('Feel_During_Stressed', 888.0), ('Feel_During_Free', 467.0),
 ('Feel_During_Optimistic', 636.0), ('Feel_During_Peaceful', 2043.0), ('Feel_During_Lonely', 467.0),
 ('Feel_During_Disappointed', 443.0), ('Feel_During_Frustrated', 669.0), ('Feel_During_Threatened', 205.0)]
covidclean_b = covidclean.dropna(subset=["Ethnicity"]).reset_index(drop=True)
living = ['Live_with_Family', 'Live_with_Friends', 'Live_with_Pets', 'Live_with_Roommates',
'Live_with_Acquaintance']
response = {}
for i, row in covidclean_b.iterrows():
    ethnic_group = row['Ethnicity']
    if any(row[living_condition] == "Yes" for living_condition in living):
        combined_living = "Living_with_Others"
        for feeling_condition in feelDuring:
            key = f"{combined_living}_{feeling_condition}_{ethnic_group}"
            if row[feeling_condition] == "Yes":
                response[key] = response.get(key, 0) + 1
stacked_data = {"Living_with_Others": {}}
for ethnic_group in set(covidclean_b['Ethnicity']):
    stacked_data["Living_with_Others"][ethnic_group] = [response.get(f"Living_with_Others_{feeling}_{ethnic_group}", 0)
                                                        for feeling in feelDuring]
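The row-wise counting loop above works, but pandas boolean masks express the same tally without iterating over rows, which scales better on large survey frames. A minimal sketch on toy data (the column names and values below are hypothetical stand-ins for the survey's Yes/No encoding):

```python
import pandas as pd

# Toy frame mimicking the survey's Yes/No encoding (hypothetical values).
df = pd.DataFrame({
    "Ethnicity": ["White", "Asian", "White", "Asian"],
    "Live_with_Family": ["Yes", "Yes", "No", "Yes"],
    "Feel_During_Bored": ["Yes", "No", "Yes", "Yes"],
})

# A boolean mask replaces the per-row `any(...)` check: True for rows
# where at least one living-condition column is "Yes".
living_with_others = df[["Live_with_Family"]].eq("Yes").any(axis=1)

# Restrict to those rows, then count "Yes" per feeling within each ethnicity.
counts = (df[living_with_others]
          .groupby("Ethnicity")["Feel_During_Bored"]
          .apply(lambda s: s.eq("Yes").sum()))
print(counts.to_dict())  # {'Asian': 1, 'White': 1}
```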
fig, ax = plt.subplots(figsize=(18, 10))
bottom_values = np.zeros(len(feelDuring))
for i, (combined_living, ethnic_data) in enumerate(stacked_data.items()):
for j, (ethnic_group, values) in enumerate(ethnic_data.items()):
ax.bar(feelDuring, values, bottom=bottom_values, label=f"{combined_living} - {ethnic_group}" if i == 0 else "")
bottom_values += values
ax.set_ylabel('Counts')
ax.set_title('Living with People, Feelings, and Ethnicity')
ax.set_xticks(range(len(feelDuring)))
ax.set_xticklabels(['Bored','Stressed', 'Free','Optimistic', 'Peaceful','Lonely','Disappointed','Frustrated','Threatened'], rotation=45)
legend = ax.legend()
legend.texts[0].set_text('Living with People and White')
legend.texts[1].set_text('Living with People and Black or African American')
legend.texts[2].set_text('Living with People and Other / I prefer not to answer')
legend.texts[3].set_text('Living with People and Hispanic / Latino')
legend.texts[4].set_text('Living with People and Multiracial')
legend.texts[5].set_text('Living with People and Asian')
plt.tight_layout()
plt.show()
Levels of Agreement with COVID-19 Measures¶
scale_mapping = {
1: 'Strongly Disagree',
2: 'Disagree',
3: 'Neutral',
4: 'Agree',
5: 'Strongly Agree'
}
sorted_categories = ['mask20', 'mask21', 'lock20', 'lock21', 'close20', 'close21',
'vaccine20', 'vaccine21', 'distance20', 'distance21', 'quarent20', 'quarent21']
df_sorted = covidclean[sorted_categories].applymap(lambda x: scale_mapping.get(x))
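`DataFrame.applymap` is deprecated as of pandas 2.1 in favour of the element-wise `DataFrame.map`. A small, version-tolerant sketch of the same Likert relabelling on toy data (the values are made up):

```python
import pandas as pd

scale_mapping = {1: 'Strongly Disagree', 2: 'Disagree', 3: 'Neutral',
                 4: 'Agree', 5: 'Strongly Agree'}

toy = pd.DataFrame({'mask20': [1, 5, 3], 'mask21': [2, 4, 4]})

# Element-wise relabelling: prefer DataFrame.map on pandas >= 2.1,
# falling back to the older applymap on earlier versions.
mapper = getattr(toy, 'map', toy.applymap)
labelled = mapper(scale_mapping.get)
print(labelled['mask20'].tolist())  # ['Strongly Disagree', 'Strongly Agree', 'Neutral']
```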
response_counts_heatmap = pd.DataFrame(index=sorted_categories, columns=scale_mapping.values())
for column in sorted_categories:
    counts = df_sorted[column].value_counts()
    for response in response_counts_heatmap.columns:
        response_counts_heatmap.at[column, response] = counts.get(response, 0)
response_counts_heatmap = response_counts_heatmap.apply(pd.to_numeric)
response_counts_bar = response_counts_heatmap.copy()
response_counts_heatmap = response_counts_heatmap.iloc[::-1]
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(24, 10))
sns.set(font_scale=1.2)
sns.heatmap(response_counts_heatmap, annot=True, cmap='Purples', fmt='d', ax=ax[0])
ax[0].set_title('Distribution of Responses to COVID-Related Measures')
ax[0].set_xlabel('Responses')
ax[0].set_ylabel('COVID-Related Measures')
ax[0].set_yticklabels(["Quarantine 2021","Quarantine 2020","Social Distance 2021","Social Distance 2020","Vaccination Pass 2021",
                       "Vaccination Pass 2020","University or Workplace Closure 2021",
                       "University or Workplace Closure 2020","Lockdown 2021","Lockdown 2020",
                       "Mask Mandate 2021","Mask Mandate 2020"],fontsize=10)
colors = ['#d9534f', '#f9c6c4', '#d3d3d3', '#5bc0de', '#337ab7']
left_starts = -response_counts_bar['Disagree'] - response_counts_bar['Strongly Disagree']
for response, color in zip(response_counts_bar.columns, colors):
    ax[1].barh(response_counts_bar.index, response_counts_bar[response], left=left_starts, color=color, label=response)
    left_starts += response_counts_bar[response]
ax[1].spines['left'].set_visible(False)
ax[1].spines['bottom'].set_visible(False)
ax[1].spines['top'].set_visible(False)
ax[1].spines['right'].set_visible(False)
ax[1].xaxis.set_major_formatter(plt.NullFormatter())
ax[1].legend(loc='upper center', bbox_to_anchor=(0.5, 1.15), ncol=len(response_counts_bar.columns), frameon=False)
ax[1].set_title('Opinions on COVID-Related Measures', fontsize=16, fontweight='bold', position=(0.5, 1.1))
ax[1].set_yticks(range(len(sorted_categories)))
ax[1].set_yticklabels(["Mask Mandate 2020","Mask Mandate 2021","Lockdown 2020","Lockdown 2021",
                       "University or Workplace Closure 2020",
                       "University or Workplace Closure 2021","Vaccination Pass 2020",
                       "Vaccination Pass 2021","Social Distance 2020","Social Distance 2021","Quarantine 2020","Quarantine 2021"], fontsize=10)
plt.tight_layout()
plt.show()
fig, ax = plt.subplots(figsize=(24, 10))
sns.set(font_scale=1.5)
sns.heatmap(response_counts_heatmap, annot=True, cmap='Purples', fmt='d')
ax.set_title('Distribution of Responses to COVID-Related Measures',fontsize=20)
ax.set_ylabel('COVID-Related Measures',fontsize=20)
ax.set_yticklabels(["Quarantine 2021","Quarantine 2020","Social Distance 2021","Social Distance 2020","Vaccination Pass 2021",
                    "Vaccination Pass 2020","University or Workplace Closure 2021",
                    "University or Workplace Closure 2020","Lockdown 2021","Lockdown 2020",
                    "Mask Mandate 2021","Mask Mandate 2020"],fontsize=15)
ax.set_xticklabels(["Strongly Disagree","Disagree","Neutral","Agree","Strongly Agree"],size=15)
plt.tight_layout()
plt.show()
fig, ax = plt.subplots(figsize=(24, 10))
colors = ['#d9534f', '#f9c6c4', '#d3d3d3', '#5bc0de', '#337ab7']
left_starts = -response_counts_bar['Disagree'] - response_counts_bar['Strongly Disagree']
for response, color in zip(response_counts_bar.columns, colors):
    ax.barh(response_counts_bar.index, response_counts_bar[response], left=left_starts, color=color, label=response)
    left_starts += response_counts_bar[response]
ax.spines['left'].set_visible(False)
ax.spines['bottom'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.xaxis.set_major_formatter(plt.NullFormatter())
ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.15), ncol=len(response_counts_bar.columns), frameon=False)
ax.set_title('Opinions on COVID-Related Measures', fontsize=16, fontweight='bold', position=(0.5, 1.1))
ax.set_yticks(range(len(sorted_categories)))
ax.set_yticklabels(["Mask Mandate 2020","Mask Mandate 2021","Lockdown 2020","Lockdown 2021",
                    "University or Workplace Closure 2020",
                    "University or Workplace Closure 2021","Vaccination Pass 2020",
                    "Vaccination Pass 2021","Social Distance 2020","Social Distance 2021","Quarantine 2020","Quarantine 2021"], fontsize=15)
plt.tight_layout()
plt.show()
covidclean = covidclean[covidclean['Age'] != 'I prefer not to answer']
covidclean = covidclean[covidclean['Gender'] != 'Not-binary']
covidclean = covidclean[covidclean['Gender'] != 'Other / I prefer not to answer']
def calculate_average_agreement(df, group_col, categories):
    return df.groupby(group_col)[categories].mean()
categories = ['mask20', 'mask21', 'lock20', 'lock21', 'close20', 'close21',
'vaccine20', 'vaccine21', 'distance20', 'distance21', 'quarent20', 'quarent21']
age_avg = calculate_average_agreement(covidclean, 'Age', categories)
gender_avg = calculate_average_agreement(covidclean, 'Gender', categories)
ethnicity_avg = calculate_average_agreement(covidclean, 'Ethnicity', categories)
age_avg_T = age_avg.T
gender_avg_T = gender_avg.T
ethnicity_avg_T = ethnicity_avg.T
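The averaging step relies on the agreement columns still holding their numeric 1-5 codes in `covidclean` (only the `df_sorted` copy was relabelled). A self-contained sketch of `calculate_average_agreement` on toy data (the age brackets and values are hypothetical):

```python
import pandas as pd

# Toy Likert responses as numeric 1-5 codes.
toy = pd.DataFrame({
    'Age': ['18-20', '18-20', '60+'],
    'mask20': [5, 3, 4],
    'mask21': [4, 4, 2],
})

def calculate_average_agreement(df, group_col, categories):
    # groupby().mean() averages the numeric agreement codes within each group.
    return df.groupby(group_col)[categories].mean()

avg = calculate_average_agreement(toy, 'Age', ['mask20', 'mask21'])
print(avg.loc['18-20', 'mask20'])  # 4.0
```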
fig, axes = plt.subplots(3, 1, figsize=(30, 25), sharex=True)
# Age group plot
age_avg_T.plot(kind='bar', ax=axes[0], colormap='viridis')
axes[0].set_title('Average Agreement Level by Age Group')
axes[0].set_ylabel('Average Agreement Level')
axes[0].legend(title='Age Groups', loc='upper right')
# Gender plot
gender_avg_T.plot(kind='bar', ax=axes[1], colormap='coolwarm')
axes[1].set_title('Average Agreement Level by Gender')
axes[1].set_ylabel('Average Agreement Level')
axes[1].legend(title='Gender', loc='upper right')
# Ethnicity plot
ethnicity_avg_T.plot(kind='bar', ax=axes[2], colormap='plasma')
axes[2].set_title('Average Agreement Level by Ethnicity')
axes[2].set_ylabel('Average Agreement Level')
axes[2].legend(title='Ethnicity', loc='upper right')
axes[2].set_xticklabels(["Mask Mandate 2020","Mask Mandate 2021","Lockdown 2020","Lockdown 2021",
                         "University or Workplace Closure 2020",
                         "University or Workplace Closure 2021","Vaccination Pass 2020",
                         "Vaccination Pass 2021","Social Distance 2020","Social Distance 2021","Quarantine 2020","Quarantine 2021"], rotation=45,fontsize=17)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Perceptions of People Who Did Not Get Vaccinated¶
Feeling_NoVac = ['Feel_novacc_Proud', 'Feel_novacc_Optimistic', 'Feel_novacc_Indifferent',
'Feel_novacc_Vulnerable', 'Feel_novacc_Disappointed', 'Feel_novacc_Hostile',
'Feel_novacc_Threatened', 'Feel_novacc_Anxious', 'Feel_novacc_Free']
feel_labels = ["Proud", "Optimistic", "Indifferent", "Vulnerable",
"Disappointed", "Hostile", "Threatened", "Anxious", "Free"]
colors = ["lightgray", "slategray", "teal", "cadetblue", "steelblue", "midnightblue", "thistle", "plum", "darkmagenta"]
age_group_counts = pd.DataFrame()
gender_group_counts = pd.DataFrame()
for feeling in Feeling_NoVac:
    counts_by_age = covidclean.groupby('Age')[feeling].apply(lambda x: (x == 'Yes').sum())
    age_group_counts[feeling] = counts_by_age
    counts_by_gender = covidclean.groupby('Gender')[feeling].apply(lambda x: (x == 'Yes').sum())
    gender_group_counts[feeling] = counts_by_gender
overall_counts = [covidclean[feeling].value_counts().get('Yes', 0) for feeling in Feeling_NoVac]
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(32, 8))
axes[0].bar(Feeling_NoVac, overall_counts)
axes[0].set_ylabel('Count of Yes Responses')
axes[0].set_title('Overall Emotions')
axes[0].tick_params(axis='x', rotation=45)
axes[0].set_xticks(range(len(feel_labels)))
axes[0].set_xticklabels(feel_labels, rotation=45)
bottom_value_age = pd.Series([0]*len(age_group_counts), index=age_group_counts.index)
for i, feeling in enumerate(Feeling_NoVac):
    axes[1].bar(age_group_counts.index, age_group_counts[feeling], bottom=bottom_value_age, color=colors[i], label=feeling)
    bottom_value_age += age_group_counts[feeling]
axes[1].set_xlabel('Age Groups')
axes[1].set_title('Emotions by Age Group')
axes[1].tick_params(axis='x', rotation=45)
legend = axes[1].legend()
for i, feeling in enumerate(Feeling_NoVac):
    if i < len(legend.texts):
        legend.texts[i].set_text(feel_labels[i])
bottom_value_gender = pd.Series([0]*len(gender_group_counts), index=gender_group_counts.index)
for i, feeling in enumerate(Feeling_NoVac):
    axes[2].bar(gender_group_counts.index, gender_group_counts[feeling], bottom=bottom_value_gender, color=colors[i], label=feeling)
    bottom_value_gender += gender_group_counts[feeling]
axes[2].set_xlabel('Gender')
axes[2].set_title('Emotions by Gender')
axes[2].tick_params(axis='x', rotation=45)
legend = axes[2].legend()
for i, feeling in enumerate(Feeling_NoVac):
    if i < len(legend.texts):
        legend.texts[i].set_text(feel_labels[i])
# Convert the DataFrame to a list of tuples (feeling, female_count, male_count)
gender_counts_list = [(feeling,
gender_group_counts.loc['Female', feeling],
gender_group_counts.loc['Male', feeling])
for feeling in Feeling_NoVac]
#print(gender_counts_list)
age_counts_list = [(age_group,
age_group_counts.loc[age_group].to_dict())
for age_group in age_group_counts.index]
#print(age_counts_list)
Feeling_NoVac = ['Feel_novacc_Proud', 'Feel_novacc_Optimistic', 'Feel_novacc_Indifferent',
'Feel_novacc_Vulnerable', 'Feel_novacc_Disappointed', 'Feel_novacc_Hostile',
'Feel_novacc_Threatened', 'Feel_novacc_Anxious', 'Feel_novacc_Free']
feel_labels = ["Proud", "Optimistic", "Indifferent", "Vulnerable",
"Disappointed", "Hostile", "Threatened", "Anxious", "Free"]
colors = ["lightgray", "slategray", "teal", "cadetblue", "steelblue", "midnightblue", "thistle", "plum", "darkmagenta"]
age_group_counts = pd.DataFrame()
gender_group_counts = pd.DataFrame()
for feeling in Feeling_NoVac:
    counts_by_age = covidclean.groupby('Age')[feeling].apply(lambda x: (x == 'Yes').sum())
    age_group_counts[feeling] = counts_by_age
    counts_by_gender = covidclean.groupby('Gender')[feeling].apply(lambda x: (x == 'Yes').sum())
    gender_group_counts[feeling] = counts_by_gender
overall_counts = [covidclean[feeling].value_counts().get('Yes', 0) for feeling in Feeling_NoVac]
fig, ax = plt.subplots(figsize=(32, 8))
ax.bar(Feeling_NoVac, overall_counts)
ax.set_ylabel('Count of Yes Responses')
ax.set_title('Overall Emotions')
ax.tick_params(axis='x', rotation=45)
ax.set_xticks(range(len(feel_labels)))
ax.set_xticklabels(feel_labels, rotation=45, size=17)
fig, ax = plt.subplots(nrows=1,ncols=2,figsize=(32, 8))
bottom_value_age = pd.Series([0]*len(age_group_counts), index=age_group_counts.index)
for i, feeling in enumerate(Feeling_NoVac):
    ax[0].bar(age_group_counts.index, age_group_counts[feeling], bottom=bottom_value_age, color=colors[i], label=feeling)
    bottom_value_age += age_group_counts[feeling]
ax[0].set_xlabel('Age Groups')
ax[0].set_title('Emotions by Age Group')
ax[0].tick_params(axis='x', rotation=45)
legend = ax[0].legend()
for i, feeling in enumerate(Feeling_NoVac):
    if i < len(legend.texts):
        legend.texts[i].set_text(feel_labels[i])
bottom_value_gender = pd.Series([0]*len(gender_group_counts), index=gender_group_counts.index)
for i, feeling in enumerate(Feeling_NoVac):
    ax[1].bar(gender_group_counts.index, gender_group_counts[feeling], bottom=bottom_value_gender, color=colors[i], label=feeling)
    bottom_value_gender += gender_group_counts[feeling]
ax[1].set_xlabel('Gender')
ax[1].set_title('Emotions by Gender')
ax[1].tick_params(axis='x', rotation=45)
legend = ax[1].legend()
for i, feeling in enumerate(Feeling_NoVac):
    if i < len(legend.texts):
        legend.texts[i].set_text(feel_labels[i])
scale_mapping = {
1: 'Strongly Disagree',
2: 'Disagree',
3: 'Neutral',
4: 'Agree',
5: 'Strongly Agree'
}
categories = ['vaccine20', 'vaccine21']
df_mapped = covidclean[categories].applymap(lambda x: scale_mapping.get(x, x))
df_milan = df_mapped[covidclean['City'] == "Milan"]
df_new_york = df_mapped[covidclean['City'] == "New York"]
counts_milan = df_milan.apply(pd.Series.value_counts).fillna(0).astype(int)
counts_new_york = df_new_york.apply(pd.Series.value_counts).fillna(0).astype(int)
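`value_counts` silently drops answer categories that nobody in a subset chose, which is why the loop below reindexes each series against `scale_mapping.values()`. A minimal illustration on toy data:

```python
import pandas as pd

order = ['Strongly Disagree', 'Disagree', 'Neutral', 'Agree', 'Strongly Agree']
s = pd.Series(['Agree', 'Agree', 'Neutral'])  # toy responses

# value_counts only reports observed answers; reindex restores the missing
# categories with fill_value=0 so every subplot shares the same five x positions.
counts = s.value_counts().reindex(order, fill_value=0)
print(counts.tolist())  # [0, 0, 1, 2, 0]
```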
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(16, 8), sharey=False)
# Loop through each category and create subplots
for j, category in enumerate(categories):
    # Reindex to ensure all response categories are present
    counts_milan_category = counts_milan[category].reindex(scale_mapping.values(), fill_value=0)
    counts_new_york_category = counts_new_york[category].reindex(scale_mapping.values(), fill_value=0)
    # Plot for New York
    axes[0, j].bar(counts_new_york_category.index, counts_new_york_category.values)
    axes[0, j].set_title(f'New York - {category}')
    axes[0, j].set_xticklabels(counts_new_york_category.index, rotation=45)
    axes[0, j].tick_params(axis='y')  # Allow automatic y-axis ticks
    # Plot for Milan
    axes[1, j].bar(counts_milan_category.index, counts_milan_category.values, color="orange")
    axes[1, j].set_title(f'Milan - {category}')
    axes[1, j].set_xticklabels(counts_milan_category.index, rotation=45)
    axes[1, j].tick_params(axis='y')  # Allow automatic y-axis ticks
# Adjust layout
plt.tight_layout()
plt.show()
count_positive = ['positive20', 'positive21', 'positive22']
count_yes_positive = covidclean[count_positive].eq('Yes').sum()
# Calculate the total count of positive cases
total_positive = count_yes_positive.sum()
# Calculate the percentages
percentages = (count_yes_positive / total_positive) * 100
# Create a table
test_positive = pd.DataFrame({
'Total count positive': count_yes_positive,
'In % of total': percentages.round(2)
})
test_positive
            Total count positive  In % of total
positive20                   371          22.84
positive21                   443          27.28
positive22                   810          49.88
count_positive = ['positive20','positive21','positive22']
count_yes_positive = covidclean[count_positive].eq('Yes').sum()
# Create a table
test_positive = pd.DataFrame({'Count of "Yes" to positive': count_yes_positive})
#Plot
test_positive.index = ['2020', '2021', '2022']
ax = test_positive.plot(kind='bar', color='royalblue')
ax.set_xticklabels(ax.get_xticklabels(), rotation=0)
plt.xlabel('Years')
plt.ylabel('Number of test positive')
plt.title('Test positive for Covid-19')
plt.grid(False)
ax.get_legend().remove()
plt.show()
# Convert the 'Age' column to a categorical type with the proper order
age_order = ['18-20', '20-24', '24-30', '30-40', '40-50', '50-60', '60+']
covidclean['Age'] = pd.Categorical(covidclean['Age'], categories=age_order, ordered=True)
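Declaring the age brackets as an ordered `Categorical` makes sorting and groupby respect the survey's bracket order, and quietly maps any value outside the declared categories (such as 'I prefer not to answer') to NaN. A minimal sketch:

```python
import pandas as pd

age_order = ['18-20', '20-24', '24-30', '30-40', '40-50', '50-60', '60+']
ages = pd.Series(['60+', '18-20', '30-40'])

# Ordered categoricals sort by the declared bracket order, not lexicographically
# (these labels happen to sort the same way, but e.g. '100+' would not).
cat = pd.Categorical(ages, categories=age_order, ordered=True)
print(pd.Series(cat).sort_values().tolist())  # ['18-20', '30-40', '60+']

# Values outside the declared categories become NaN rather than raising.
odd = pd.Categorical(['I prefer not to answer'], categories=age_order, ordered=True)
print(odd.isna().tolist())  # [True]
```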
# Filter out the 'I prefer not to answer' entries
cc_filtered = covidclean[(covidclean['Age'] != 'I prefer not to answer')]
# Count the occurrences of "Yes" for each positive test column
count_positive = ['positive20', 'positive21', 'positive22']
count_yes_positive = covidclean[count_positive].eq('Yes').sum()
# Keep only rows whose 'Age' value is one of the defined age groups
filtered_Age = covidclean[covidclean['Age'].isin(age_order)]
# Group by 'Age' and calculate the sum of "Yes" for each positive test column
count_yes_positive_age = (filtered_Age.groupby('Age')[count_positive]
.apply(lambda x: x.eq('Yes').sum()))
# Convert the result to a DataFrame for better visualization
count_yes_df = count_yes_positive_age.reset_index().rename(columns={col: f'Count of Yes for {col}' for col in count_positive})
# Plot using seaborn
plt.figure(figsize=(10, 6))
sns.barplot(x='Age', y='value', hue='variable', data=pd.melt(count_yes_df, id_vars=['Age']))
plt.title('Test positive for Covid-19 by Age')
plt.xlabel('Age Group')
plt.ylabel('Total Number of test positive')
plt.legend(title='Positive Test Column', bbox_to_anchor=(1, 1), loc='upper left')
plt.grid(axis='y', alpha=0.4)
plt.show()
work_study_counts = covidclean['Work_Study_Status'].value_counts()
# Create a table
work_study_table = pd.DataFrame({'Count': work_study_counts})
total_count = work_study_table['Count'].sum()
# Calculate the percentage
work_study_table['In % of total'] = (work_study_table['Count'] / total_count) * 100
work_study_table
                   Count  In % of total
Work_Study_Status
Study               2476      83.423181
Work                 334      11.253369
Both                 136       4.582210
Neither               22       0.741240
count_positive = ['positive20', 'positive21', 'positive22']
# Count the occurrences of "Yes" for each positive test column
count_yes_positive = covidclean[count_positive].eq('Yes').sum()
# Calculate the total count of positive cases
total_positive = count_yes_positive.sum()
# Count the occurrences of each work and study status
work_study_counts = covidclean['Work_Study_Status'].value_counts()
# Create a table for work and study status counts
work_study_table = pd.DataFrame({'Count': work_study_counts})
# Calculate the total count for all work and study statuses
total_count = work_study_table['Count'].sum()
# Create a new DataFrame to store the counts of "Yes" for each work and study status
work_study_positive_counts = pd.DataFrame(index=work_study_table.index)
work_study_positive_counts
# Count "Yes" responses per work/study status for each positive test column
for column in count_positive:
    work_study_positive_counts[f'In % {column}'] = (
        covidclean.loc[covidclean[column] == 'Yes', 'Work_Study_Status'].value_counts())
# Replace NaN with 0 for cases where there are no "Yes" for a particular positive test column
work_study_positive_counts = work_study_positive_counts.fillna(0)
# Normalize the counts to percentages
work_study_positive_percentage = (work_study_positive_counts / total_count) * 100
work_study_positive_percentage = work_study_positive_percentage.round(2)
work_study_positive_percentage
                   In % positive20  In % positive21  In % positive22
Work_Study_Status
Study                         8.66            11.56            24.29
Work                          3.20             2.70             1.72
Both                          0.57             0.61             1.08
Neither                       0.07             0.07             0.20
# Plot a stacked bar chart
ax = work_study_positive_percentage.plot(kind='bar', stacked=True, figsize=(8, 6), colormap='tab20c')
plt.title('Positive Test Results by Study Work status')
plt.xlabel('Work_Study_Status')
plt.ylabel('Percentage')
plt.legend(title='Positive Test Columns')
# Add values inside the bar for 'Study' only
# Add values inside the bar for 'Study' only
for index, (status, row) in enumerate(work_study_positive_percentage.iterrows()):
    if status != 'Study':
        continue
    p20 = row['In % positive20']
    p21 = row['In % positive21']
    p22 = row['In % positive22']
    # Display percentage values inside each stacked segment
    ax.text(index, p20 / 2, f'{p20:.2f}%', ha='center', va='center', fontweight='bold')
    ax.text(index, p20 + p21 / 2, f'{p21:.2f}%', ha='center', va='center', fontweight='bold')
    ax.text(index, p20 + p21 + p22 / 2, f'{p22:.2f}%', ha='center', va='center', fontweight='bold')
ax.set_yticks(np.arange(0, 101, 25))
ax.set_yticklabels([f'{y}%' for y in np.arange(0, 101, 25)])
ax.set_xticklabels(work_study_positive_percentage.index, rotation=0)
ax.grid(False)
plt.show()
Positive Test Results by Gender¶
# Count the occurrences of "Yes" for each positive test column
count_positive = ['positive20', 'positive21', 'positive22']
count_yes_positive = covidclean[count_positive].eq('Yes').sum()
# Filter the DataFrame for "Female" and "Male" genders
filtered_gender = covidclean[covidclean['Gender'].isin(['Female', 'Male'])]
# Group by 'Gender' and calculate the sum of "Yes" for each positive test column
count_yes_positive_gender = (filtered_gender.groupby('Gender')[count_positive]
.apply(lambda x: x.eq('Yes').sum()))
# Calculate the total count of positive cases
total_positive = count_yes_positive.sum()
# Create a DataFrame to store the counts by "Gender" and by each positive test column
result_table = pd.DataFrame({
'For Male': count_yes_positive_gender.loc['Male'],
'For Female': count_yes_positive_gender.loc['Female']
})
# Calculate the percentages
result_table['Male in % of Total'] = result_table['For Male'] / total_positive * 100
result_table['Female in % of Total'] = result_table['For Female'] / total_positive * 100
result_table
            For Male  For Female  Male in % of Total  Female in % of Total
positive20       224         147           13.793103              9.051724
positive21       278         165           17.118227             10.160099
positive22       480         330           29.556650             20.320197
plot_data = result_table[['Male in % of Total', 'Female in % of Total']]
# Set up the bar plot
fig, ax = plt.subplots(figsize=(8, 6))
plot_data.plot(kind='bar', ax=ax, width=0.8, color=['blue', 'skyblue'])
ax.set_xticklabels(count_positive, rotation=0)
# Set plot labels and title
ax.set_ylabel('Total positive test')
ax.set_title('Percentage of Total Positive Cases by Gender and Test')
# Add labels on top of each bar
for p in ax.patches:
ax.annotate(f'{p.get_height():.2f}%', (p.get_x() + p.get_width() / 2., p.get_height()),
ha='center', va='center', xytext=(0, 10), textcoords='offset points')
# Set additional features
ax.set_title('Percentage of Positive Test by Gender')
ax.set_yticks(np.arange(0, 101, 20))
ax.set_yticklabels(['{:.0f}%'.format(x) for x in ax.get_yticks()])
ax.legend(title='Gender', loc='upper right', bbox_to_anchor=(1, 1))
ax.grid(False)
plt.show()
Overall Positive COVID-19 Tests (Once / Twice / Thrice)¶
count_positive = ['positive20', 'positive21', 'positive22']
# Count respondents positive at least once, at least twice, and in all three years
count_once = ((covidclean['positive20'] == 'Yes') |
              (covidclean['positive21'] == 'Yes') |
              (covidclean['positive22'] == 'Yes')).sum()
count_twice = (((covidclean['positive20'] == 'Yes') & (covidclean['positive21'] == 'Yes')) |
               ((covidclean['positive20'] == 'Yes') & (covidclean['positive22'] == 'Yes')) |
               ((covidclean['positive21'] == 'Yes') & (covidclean['positive22'] == 'Yes'))).sum()
count_thrice = ((covidclean['positive20'] == 'Yes') &
                (covidclean['positive21'] == 'Yes') &
                (covidclean['positive22'] == 'Yes')).sum()
# Calculate total count of positive cases
total_positive = count_once + count_twice + count_thrice
# Create a table
test_positive = pd.DataFrame({
'Counts': [total_positive, count_once, count_twice, count_thrice],
'In %': [100, count_once / total_positive * 100, count_twice / total_positive * 100, count_thrice / total_positive * 100]
}, index=['Total count of positive', 'Positive Once', 'Positive Twice', 'Positive Thrice'])
test_positive['In %'] = test_positive['In %'].round(2)
test_positive
                         Counts    In %
Total count of positive    1624  100.00
Positive Once              1411   86.88
Positive Twice              167   10.28
Positive Thrice              46    2.83
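As written, the boolean expressions above count "positive at least once", "at least twice", and "in all three years", so a respondent positive in all three years contributes to every row; their sum still equals the total number of "Yes" responses. An exactly-N breakdown falls out of a row-wise sum instead; a sketch on toy data:

```python
import pandas as pd

# Toy Yes/No test results; column names mirror the survey's.
toy = pd.DataFrame({
    'positive20': ['Yes', 'Yes', 'No',  'No'],
    'positive21': ['Yes', 'No',  'Yes', 'No'],
    'positive22': ['Yes', 'No',  'No',  'No'],
})

# Number of positive years per respondent; value_counts then gives the
# exactly-N breakdown (N values nobody hit are simply absent).
n_positive = toy[['positive20', 'positive21', 'positive22']].eq('Yes').sum(axis=1)
exact = n_positive.value_counts().sort_index()
print(exact.to_dict())  # {0: 1, 1: 2, 3: 1}
```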
# Feelings experienced upon testing positive for COVID-19
feel_sick = ['Feel_sick_Stressed', 'Feel_sick_Optimistic', 'Feel_sick_Peace', 'Feel_sick_Lonely', 'Feel_sick_Vulnerable',
'Feel_sick_Disappointed', 'Feel_sick_Frustrated', 'Feel_sick_Anxious', 'Feel_sick_Threatened']
# Select rows where the test result is positive
positive_rows = covidclean[(covidclean['positive20'] == 'Yes') | (covidclean['positive21'] == 'Yes') | (covidclean['positive22'] == 'Yes')]
# Count the occurrences of "Yes" for each feeling category among positive cases
feel_sick_count_positive = positive_rows[feel_sick].eq('Yes').sum()
# Normalize data
feel_sick_percentage_positive = (feel_sick_count_positive / total_positive) * 100
# Create a table
feel_sick_analysis = pd.DataFrame({
'Counts': feel_sick_count_positive,
'In %': feel_sick_percentage_positive.round(2) # Round to 2 decimal places
}, index=feel_sick)
feel_sick_analysis_sorted = feel_sick_analysis.sort_values(by='In %', ascending=False)
feel_sick_analysis_sorted
                        Counts   In %
Feel_sick_Peace            546  33.62
Feel_sick_Frustrated       465  28.63
Feel_sick_Lonely           420  25.86
Feel_sick_Stressed         386  23.77
Feel_sick_Vulnerable       264  16.26
Feel_sick_Anxious          236  14.53
Feel_sick_Threatened       191  11.76
Feel_sick_Optimistic       187  11.51
Feel_sick_Disappointed     125   7.70
# Create a new figure with two subplots side by side
fig, axs = plt.subplots(1, 2, figsize=(20, 6))
# Plotting on the first subplot (Overall Test Positive on COVID-19)
in_percentage_sorted = test_positive['In %'].drop('Total count of positive').sort_values(ascending=False)
sns.barplot(x=in_percentage_sorted.values, y=in_percentage_sorted.index, palette=['lightseagreen'], ax=axs[0])
axs[0].xaxis.tick_top()
axs[0].set_xticks(np.arange(0, 101, 20))
axs[0].set_xticklabels([f'{x}%' for x in np.arange(0, 101, 20)])
axs[0].set_title('Overall Test Positive on COVID-19')
## Plotting on the second subplot (Feelings experienced upon testing positive)
feel_sick_analysis_sorted = feel_sick_analysis.sort_values(by='In %', ascending=False)
# Visualization of the sorted data
sns.barplot(x='In %', y=feel_sick_analysis_sorted.index, data=feel_sick_analysis_sorted,
palette=['teal' if i < 3 else 'silver' for i in range(len(feel_sick_analysis_sorted))],
order=feel_sick_analysis_sorted.index, ax=axs[1])
# % on top
axs[1].xaxis.tick_top()
axs[1].set_xticks(np.arange(0, 101, 20))
axs[1].set_xticklabels([f'{x}%' for x in np.arange(0, 101, 20)])
# % on bottom
axs[1].tick_params(axis='x', which='both', bottom=True, top=True)
axs[1].set_xticks(np.arange(0, 101, 20))
axs[1].set_xticklabels([f'{x}%' for x in np.arange(0, 101, 20)])
axs[1].set_xlabel('Percentage (%)')
axs[1].set_ylabel('Feelings experienced')
axs[1].set_title('Feelings experienced upon testing positive for COVID-19', pad=30)
# Adjust layout
plt.tight_layout(pad=2.0)
plt.show()
Feelings About Studying During the Pandemic (Students)¶
#from column "Work_Study_Status"
work_study_counts = covidclean['Work_Study_Status'].value_counts()
work_study_counts
Work_Study_Status
Study      2476
Work        334
Both        136
Neither      22
Name: count, dtype: int64
# Filter the DataFrame to include only "Study" and "Both" values in the 'Work_Study_Status' column
student = covidclean[covidclean['Work_Study_Status'].isin(['Study', 'Both'])]
# Count the occurrences of "Study" and "Both"
student_counts = student['Work_Study_Status'].value_counts()
student_counts
Work_Study_Status
Study    2476
Both      136
Name: count, dtype: int64
count_study = ['Feel_study_Bored', 'Feel_study_Stressed', 'Feel_study_Free', 'Feel_study_Proud', 'Feel_study_Optimistic',
'Feel_study_Lonely', 'Feel_study_Disappointed', 'Feel_study_Frustrated', 'Feel_study_Anxious']
count_study_feel = covidclean[count_study].eq('Yes').sum()
# Create a table
study_feel = pd.DataFrame({'Count': count_study_feel})
study_feel = study_feel.sort_values(by='Count', ascending=False)
# Calculate the percentage and round to 2 decimal places
total_count = study_feel['Count'].sum()
study_feel['In % of total'] = (study_feel['Count'] / total_count) * 100
study_feel['In % of total'] = study_feel['In % of total'].round(2)
study_feel
Count | In % of total | |
---|---|---|
Feel_study_Stressed | 1250 | 20.27 |
Feel_study_Frustrated | 1004 | 16.28 |
Feel_study_Bored | 953 | 15.46 |
Feel_study_Lonely | 941 | 15.26 |
Feel_study_Anxious | 721 | 11.69 |
Feel_study_Disappointed | 461 | 7.48 |
Feel_study_Optimistic | 363 | 5.89 |
Feel_study_Free | 336 | 5.45 |
Feel_study_Proud | 137 | 2.22 |
#Checking on the individuals who selected more than one feeling
covidclean['Total_Selected'] = covidclean[count_study].eq('Yes').sum(axis=1)
more_than_one_feel_count = (covidclean['Total_Selected'] > 1).sum()
more_than_one_feel_count
np.int64(2069)
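The multi-select counting above hinges on `eq('Yes').sum(axis=1)`. A minimal sketch on a toy frame (the `Feel_A`/`Feel_B` columns are made up for illustration):

```python
import pandas as pd

# Toy survey: two multi-select feeling columns, three respondents
demo = pd.DataFrame({
    'Feel_A': ['Yes', 'No', 'Yes'],
    'Feel_B': ['Yes', 'No', 'No'],
})
selected = demo.eq('Yes').sum(axis=1)  # per-respondent count of 'Yes' answers
print(selected.tolist())               # [2, 0, 1]
print(int((selected > 1).sum()))       # 1 respondent selected more than one feeling
```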
# Remove the 'Total_Selected' column if you don't need it
covidclean.drop('Total_Selected', axis=1, inplace=True)
# Define positive and negative feelings
positive_feelings = ['Feel_study_Optimistic', 'Feel_study_Free', 'Feel_study_Proud']
negative_feelings = ['Feel_study_Stressed', 'Feel_study_Frustrated', 'Feel_study_Bored',
'Feel_study_Lonely', 'Feel_study_Anxious', 'Feel_study_Disappointed']
# Create a new column 'Feeling_Category'.
# All rows start as 'Neutral', the default when neither a positive nor a negative feeling was selected.
# Note that the 'Negative' assignment below runs last, so respondents who selected both
# positive and negative feelings end up classified as 'Negative'.
covidclean['Feeling_Category'] = 'Neutral'
covidclean.loc[covidclean[positive_feelings].eq('Yes').any(axis=1), 'Feeling_Category'] = 'Positive'
covidclean.loc[covidclean[negative_feelings].eq('Yes').any(axis=1), 'Feeling_Category'] = 'Negative'
# Count the occurrences for each category
feeling_category_counts = covidclean['Feeling_Category'].value_counts()
# Create a table for counts
feeling_category_table = pd.DataFrame({'Count': feeling_category_counts})
# Calculate the percentage and round to 2 decimal places
total_count_category = feeling_category_table['Count'].sum()
feeling_category_table['In % of total'] = (feeling_category_table['Count'] / total_count_category) * 100
feeling_category_table['In % of total'] = feeling_category_table['In % of total'].round(2)
# Display the DataFrame
feeling_category_table
Count | In % of total | |
---|---|---|
Feeling_Category | ||
Negative | 2308 | 77.76 |
Neutral | 356 | 11.99 |
Positive | 304 | 10.24 |
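Because the 'Negative' assignment runs after the 'Positive' one, a respondent who ticked both kinds of feelings lands in the 'Negative' bucket. A minimal sketch with a single made-up respondent:

```python
import pandas as pd

# One made-up respondent who ticked both a positive and a negative feeling
demo = pd.DataFrame({'Feel_study_Proud': ['Yes'], 'Feel_study_Stressed': ['Yes']})
demo['Feeling_Category'] = 'Neutral'
demo.loc[demo[['Feel_study_Proud']].eq('Yes').any(axis=1), 'Feeling_Category'] = 'Positive'
demo.loc[demo[['Feel_study_Stressed']].eq('Yes').any(axis=1), 'Feeling_Category'] = 'Negative'
print(demo['Feeling_Category'].iloc[0])  # Negative: the later assignment wins
```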
# Create a new figure with two subplots side by side
fig, axs = plt.subplots(1, 2, figsize=(20, 6))
# 1. Plotting on the left subplot
top3_feelings = study_feel.index[:3].tolist()
rest_feelings = study_feel.index[3:].tolist()
# Create a custom color list
custom_colors = ['darkslateblue' if feeling in top3_feelings else 'gray' for feeling in study_feel.index]
sns.barplot(x='In % of total', y=study_feel.index, data=study_feel, palette=custom_colors, ax=axs[0])
# Set x-axis ticks and labels
axs[0].set_xticks(np.arange(0, 61, 20))
axs[0].set_xticklabels([f'{x}%' for x in np.arange(0, 61, 20)])
# Set y-axis label and title
axs[0].set_ylabel('Feelings')
axs[0].set_title('Feelings about Studying During the Pandemic (Top 3 Highlighted)')
axs[0].grid(axis='x')
# 2. Plotting on the right subplot
ax2 = sns.barplot(x=feeling_category_table.index, y='In % of total', data=feeling_category_table, palette='viridis', ax=axs[1])
# axs[1].set_ylabel('Percentage of Total')
axs[1].set_xlabel('Feeling Category')
axs[1].set_title('Feelings Category for Studying During the Pandemic')
axs[1].set_yticks(range(0, 101, 20))
axs[1].set_yticklabels([f'{x}%' for x in np.arange(0, 101, 20)])
axs[1].set_ylim(0, 100)
# Add labels on top of each bar in the right subplot
for p in ax2.patches:
    ax2.annotate(f'{p.get_height():.2f}%', (p.get_x() + p.get_width() / 2., p.get_height()),
                 ha='center', va='center', xytext=(0, 10), textcoords='offset points')
# Adjust layout and display the plot
plt.tight_layout()
plt.show()
count_professional = ['Feel_profess_Free','Feel_profess_Aggressive','Feel_profess_Frustrated','Feel_profess_Disappointed','Feel_profess_Vulnerable',
'Feel_profess_Lonely','Feel_profess_Optimistic','Feel_profess_Enthusiastic','Feel_profess_Stressed']
count_profess_feel = covidclean[count_professional].eq('Yes').sum()
# Create a table
profess_feel = pd.DataFrame({'Count': count_profess_feel})
profess_feel = profess_feel.sort_values(by='Count', ascending=False)
total_count_profess = profess_feel['Count'].sum()
profess_feel['In % of total'] = (profess_feel['Count'] / total_count_profess) * 100
profess_feel = profess_feel.sort_values(by='In % of total', ascending=False)
profess_feel = profess_feel.round(2)
profess_feel
Count | In % of total | |
---|---|---|
Feel_profess_Vulnerable | 109 | 16.49 |
Feel_profess_Disappointed | 88 | 13.31 |
Feel_profess_Frustrated | 86 | 13.01 |
Feel_profess_Optimistic | 74 | 11.20 |
Feel_profess_Stressed | 65 | 9.83 |
Feel_profess_Lonely | 64 | 9.68 |
Feel_profess_Free | 60 | 9.08 |
Feel_profess_Aggressive | 58 | 8.77 |
Feel_profess_Enthusiastic | 57 | 8.62 |
#From prof_situation_change column
prof_situation_change = covidclean['prof_situation_change'].value_counts()
prof_situation_change
prof_situation_change
No, it didn't change                                   97
Yes, I had a reduction in working hours and/or wage    57
Yes, due to smartworking                               45
Yes, my company went out of business                   33
Yes, I quit my job                                     30
Yes, my contract ended                                 30
None of the above                                      18
Yes, I was fired                                       14
Other / I prefer not to answer                         10
Name: count, dtype: int64
# Define the categories to be grouped under 'Yes_changed'
grouped_categories = [
'Yes, I had a reduction in working hours and/or wage',
'Yes, due to smartworking',
'Yes, my company went out of business',
'Yes, I quit my job',
'Yes, my contract ended',
'Yes, I was fired'
]
# Create a new column 'prof_situation_grouped'
covidclean['prof_situation_grouped'] = covidclean['prof_situation_change'].replace(grouped_categories, 'Yes_changed')
# Check unique values after replacement
#unique_values_after = covidclean['prof_situation_grouped'].unique()
#print("\nUnique values after replacement:")
#print(unique_values_after)
# Display the updated distribution
prof_situation_change = covidclean['prof_situation_grouped'].value_counts()
print(prof_situation_change)
# Calculate the distribution and percentages
prof_situation_change = covidclean['prof_situation_grouped'].value_counts()
total_count = prof_situation_change.sum()
prof_situation_change_percentage = (prof_situation_change / total_count) * 100
prof_situation_change_percentage.round(2)
prof_situation_grouped
Yes_changed                       209
No, it didn't change               97
None of the above                  18
Other / I prefer not to answer     10
Name: count, dtype: int64

prof_situation_grouped
Yes_changed                       62.57
No, it didn't change              29.04
None of the above                  5.39
Other / I prefer not to answer     2.99
Name: count, dtype: float64
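The `replace` call above collapses several answer strings into a single label; a quick sketch on a made-up series:

```python
import pandas as pd

s = pd.Series(["Yes, I was fired", "No, it didn't change", "Yes, my contract ended"])
grouped = s.replace(["Yes, I was fired", "Yes, my contract ended"], "Yes_changed")
print(grouped.value_counts().to_dict())  # {'Yes_changed': 2, "No, it didn't change": 1}
```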
#Subplot of profess_feel and prof_situation_change
plt.figure(figsize=(20, 6))
# First Subplot on the left
plt.subplot(1, 2, 1)
ax1 = sns.barplot(x='In % of total', y=profess_feel.index, data=profess_feel,
palette=['darkblue' if val >= profess_feel['In % of total'].nlargest(3).min() else 'gray'
for val in profess_feel['In % of total']])
ax1.set_ylabel('Feelings')
ax1.set_title('Distribution of Professional Feelings')
ax1.set_xticks(np.arange(0, 61, 20))
ax1.set_xticklabels([f'{x}%' for x in np.arange(0, 61, 20)])
# 2 Subplot on the right
plt.subplot(1, 2, 2)
ax2 = sns.barplot(x=prof_situation_change_percentage.values, y=prof_situation_change_percentage.index, palette='viridis')
plt.title('Professional Situation Changes')
plt.ylabel('Categories')
ax2.set_xticks(np.arange(0, 101, 20))
ax2.set_xticklabels([f'{x}%' for x in np.arange(0, 101, 20)])
ax2.grid(False)
plt.tight_layout()
plt.show()
Compare comfort levels in social activities between Winter 2021 and Summer 2022.¶
comfort = ['Comfort_group_21', 'Comfort_go_public_21','Comfort_publictrans_21', 'Comfort_event_outdoor_21',
'Comfort_group_22', 'Comfort_go_public_22','Comfort_publictrans_22', 'Comfort_event_outdoor_22']
count_comfort = covidclean[comfort].mean().round(2)
count_comfort
Comfort_group_21           3.83
Comfort_go_public_21       3.26
Comfort_publictrans_21     2.93
Comfort_event_outdoor_21   2.92
Comfort_group_22           4.51
Comfort_go_public_22       4.19
Comfort_publictrans_22     3.74
Comfort_event_outdoor_22   3.90
dtype: float64
# Define groupings for each year (rank from 1 to 5)
comfort_2021 = ['Comfort_group_21', 'Comfort_go_public_21', 'Comfort_publictrans_21', 'Comfort_event_outdoor_21']
comfort_2022 = ['Comfort_group_22', 'Comfort_go_public_22', 'Comfort_publictrans_22', 'Comfort_event_outdoor_22']
# Create a DataFrame for each year
comfort_2021_data = covidclean[comfort_2021]
comfort_2022_data = covidclean[comfort_2022]
# Calculate the mean comfort level for each activity in 2021 and 2022
mean_comfort_2021 = comfort_2021_data.mean()
mean_comfort_2022 = comfort_2022_data.mean()
# Create a DataFrame for visualization
mean_comfort_comparison_data = pd.DataFrame({
'Activity': ['Group', 'Go_public', 'Public_trans', 'Event_outdoor'],
'2021': mean_comfort_2021.values.tolist(),
'2022': mean_comfort_2022.values.tolist()
})
# Melt the DataFrame for seaborn barplot
melted_data = mean_comfort_comparison_data.melt(id_vars='Activity', var_name='Year', value_name='Mean_Comfort_Level').round(2)
# Extract the activity names without the year suffix
activity_labels = mean_comfort_comparison_data['Activity']
# Plotting side-by-side bar plots
plt.figure(figsize=(8, 6))
sns.barplot(x='Activity', y='Mean_Comfort_Level', hue='Year', data=melted_data)
# Set y-axis limits
plt.ylim(0, 5)
plt.yticks(np.arange(0, 5.9, 1))
plt.title('Comfort Levels by Activity')
#plt.xlabel('Comfort Activity')
plt.ylabel('Mean of Comfort Level')
plt.legend(title='Year', bbox_to_anchor=(1, 1), loc='upper left')
plt.grid(axis='y', alpha=0.4)
plt.show()
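The melt step turns the one-row-per-activity wide table into the long format seaborn expects for `hue`. A minimal sketch with one made-up activity:

```python
import pandas as pd

wide = pd.DataFrame({'Activity': ['Group'], '2021': [3.83], '2022': [4.51]})
long = wide.melt(id_vars='Activity', var_name='Year', value_name='Mean_Comfort_Level')
print(long.values.tolist())  # [['Group', '2021', 3.83], ['Group', '2022', 4.51]]
```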
scale_mapping = {
1: 'Strongly Disagree',
2: 'Disagree',
3: 'Neutral',
4: 'Agree',
5: 'Strongly Agree'
}
sorted_categories = ['Comfort_group_21', 'Comfort_group_22', 'Comfort_go_public_21', 'Comfort_go_public_22',
'Comfort_publictrans_21','Comfort_publictrans_22', 'Comfort_event_outdoor_21', 'Comfort_event_outdoor_22']
# applymap is deprecated in pandas >= 2.1; DataFrame.map is the drop-in replacement there
df_sorted = covidclean[sorted_categories].applymap(lambda x: scale_mapping.get(x))
response_counts_heatmap = pd.DataFrame(index=sorted_categories, columns=scale_mapping.values())
for column in sorted_categories:
    counts = df_sorted[column].value_counts()
    for response in response_counts_heatmap.columns:
        response_counts_heatmap.at[column, response] = counts.get(response, 0)
response_counts_heatmap = response_counts_heatmap.apply(pd.to_numeric)
response_counts_bar = response_counts_heatmap.copy()
# Build a diverging stacked bar chart of the response counts
fig, ax = plt.subplots(figsize=(14, 6))
# Define new colors
colors = ['red', 'salmon', 'gray', 'lightblue', 'blue']
left_starts = -response_counts_bar['Disagree'] - response_counts_bar['Strongly Disagree']
for response, color in zip(response_counts_bar.columns, colors):
    ax.barh(response_counts_bar.index, response_counts_bar[response], left=left_starts, color=color, label=response)
    left_starts += response_counts_bar[response]
ax.spines['left'].set_visible(False)
ax.spines['bottom'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.xaxis.set_major_formatter(plt.NullFormatter())
ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.15), ncol=len(response_counts_bar.columns), frameon=False)
ax.set_title('Comfort levels in social activities in 2021 and 2022', fontsize=16, fontweight='bold', position=(0.5, 1.1))
ax.set_yticks(range(len(sorted_categories)))
ax.set_yticklabels(["Group in 21", "Group in 22", "Go public in 21", "Go public in 22", "Public transport in 21",
"Public transport in 22", "Event outdoor in 21", "Event outdoor in 22"], fontsize=10)
plt.tight_layout()
plt.show()
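The `left_starts` bookkeeping above is what turns the stacked `barh` calls into a diverging chart: disagreement bars start left of zero, and the neutral/agree bars start at zero. The arithmetic can be checked with made-up counts:

```python
# Made-up response counts for a single survey item
counts = {'Strongly Disagree': 10, 'Disagree': 20, 'Neutral': 30, 'Agree': 25, 'Strongly Agree': 15}

# Same bookkeeping as in the plot: start left of zero by the total disagreement
left = -(counts['Disagree'] + counts['Strongly Disagree'])
segments = {}
for response, width in counts.items():
    segments[response] = (left, left + width)
    left += width
print(segments['Disagree'])  # (-20, 0): disagreement ends exactly at zero
print(segments['Neutral'])   # (0, 30): neutral and agreement sit right of zero
```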
# Select relevant columns for comfort levels
comfort_columns = [
'Comfort_group_21', 'Comfort_go_public_21',
'Comfort_publictrans_21', 'Comfort_event_outdoor_21',
'Comfort_group_22', 'Comfort_go_public_22',
'Comfort_publictrans_22', 'Comfort_event_outdoor_22'
]
# Extract the relevant data
comfort_data = covidclean[comfort_columns]
# Calculate the correlation matrix
correlation_matrix = comfort_data.corr()
# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
# Create a heatmap for visualization, only showing the lower triangle
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='BuPu', fmt=".2f", linewidths=.5, mask=mask)
plt.title('Correlation Matrix of Comfort Levels')
plt.show()
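The `np.triu` mask hides the upper triangle (diagonal included), so the heatmap shows each pair of comfort variables exactly once. On a 3x3 example:

```python
import numpy as np

# True cells are hidden by sns.heatmap's mask argument,
# leaving only the strict lower triangle visible
mask = np.triu(np.ones((3, 3), dtype=bool))
print(mask)
```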
Individual’s perception of returning to one's pre-pandemic self.¶
covidclean.dropna(subset = ['Feel_normal']).head()
City | Work_Study_Status | Live_with_Family | Live_with_Friends | Live_with_Pets | Live_with_Roommates | Live_with_Acquaintance | Live_Alone | Feel_During_Bored | Feel_During_Stressed | ... | Comfort_event_outdoor_22 | Feel_normal | Job_sector | School_major | Gender | Age | Ethnicity | Emotion | Feeling_Category | prof_situation_grouped | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | New York | Study | Yes | No | No | No | No | No | No | No | ... | 2.0 | 4.0 | NaN | Engineering and Technology | Male | 24-30 | Asian | Stressed | Positive | NaN |
1 | New York | Both | Yes | No | No | No | No | No | Yes | No | ... | 4.0 | 4.0 | NaN | Engineering and Technology | Female | 24-30 | Asian | Frustrated | Negative | NaN |
2 | New York | Work | No | No | No | Yes | No | No | No | Yes | ... | 5.0 | 8.0 | Information Technology | NaN | Male | 30-40 | Asian | Distant | Neutral | No, it didn't change |
3 | New York | Work | No | No | Yes | No | No | No | Yes | No | ... | 3.0 | 5.0 | Food & Consumer Goods | NaN | Male | 40-50 | White | Frustrated | Neutral | No, it didn't change |
4 | New York | Work | No | No | No | No | No | Yes | No | No | ... | 5.0 | 6.0 | Food & Consumer Goods | NaN | Male | 30-40 | White | Optimistic | Neutral | No, it didn't change |
5 rows × 95 columns
def plot_text(ax, x, text, color):
    """Helper function to plot text below the axes."""
    ax.text(x, ax.get_ylim()[0] - (ax.get_ylim()[1] - ax.get_ylim()[0]) * 0.12, text, ha='center', va='top', color=color, fontsize=10)
fig, ax = plt.subplots(figsize=(12, 4))
# Plot the histogram of Feel_normal scores
sns.histplot(x="Feel_normal", data=covidclean, binwidth=0.4, color='purple', kde=True, ax=ax)
plot_text(ax, 1, 'Not normal at all', 'red')
plot_text(ax, 10, 'Completely normal', 'green')
ax.set(ylabel="Number of people");
# Convert the 'Age' column to a categorical type with the proper order
age_order = ['18-20', '20-24', '24-30', '30-40', '40-50', '50-60', '60+']
covidclean['Age'] = pd.Categorical(covidclean['Age'], categories=age_order, ordered=True)
# Note: after the categorical cast above, values outside age_order (such as 'I prefer not to answer')
# have already become NaN, so this comparison acts as a safety net
cc_filtered = covidclean[(covidclean['Age'] != 'I prefer not to answer')]
# Create a figure with 1 row and 2 columns
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12,4))
sns.boxplot(data=cc_filtered, x="Feel_normal", y="City", palette='tab10', medianprops=dict(color="yellow", alpha=0.7), ax=ax1)
plot_text(ax1, 1, 'Not normal at all', 'red')
plot_text(ax1, 10, 'Completely normal', 'green')
ax1.set(ylabel="City")
sns.boxplot(data=cc_filtered, x="Feel_normal", y="Age", palette='tab10',ax=ax2)
plot_text(ax2, 1, 'Not normal at all', 'red')
plot_text(ax2, 10, 'Completely normal', 'green')
ax2.set(ylabel="Age")
# Adjust layout and show the plot
fig.suptitle('Feel Normal after the pandemic', fontsize=20)
plt.tight_layout()
plt.show()
Predominant emotion participants associate with COVID-19¶
general_emotion=cc_filtered.dropna(subset = ['Emotion'])
general_emotion.groupby(['Emotion']).size()
Emotion
Anxious          71
Disappointed     56
Distant          50
Free             40
Frustrated      130
Lonely           52
Optimistic       38
Peaceful         35
Stressed         61
Vulnerable       61
dtype: int64
positive_emotions = ['Free', 'Optimistic', 'Peaceful']
negative_emotions = ['Anxious', 'Disappointed', 'Frustrated','Distant','Lonely','Stressed','Vulnerable']
# Function to classify each emotion
def classify_emotion(emotion):
    if emotion in positive_emotions:
        return 'Positive'
    elif emotion in negative_emotions:
        return 'Negative'
    return None  # not reached here: the two lists cover every emotion in the data
general_emotion['Emotion_Type'] = general_emotion['Emotion'].apply(classify_emotion)
# Create a figure with 1 row and 2 columns
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))
# Plot the countplot on the first subplot
sns.countplot(y="Emotion", hue='Emotion_Type', data=general_emotion, order=general_emotion['Emotion'].value_counts(ascending=False).index, ax=axes[0])
# Plot the countplot on the second subplot
ax2 = sns.countplot(x='Emotion_Type', data=general_emotion, ax=axes[1])
# Calculate total number of entries in the data to find the percentage
total = len(general_emotion['Emotion_Type'])
# Annotate each bar with the percentage
for p in ax2.patches:
height = p.get_height() # Height of the bar represents the count
percentage = 100 * height / total # Calculate percentage
# Adding text to each bar
ax2.text(p.get_x() + p.get_width()/2., height + 3, '{:1.1f}%'.format(percentage), ha="center")
# Adjust layout and show the plot
fig.suptitle('Emotions Participants Associate with COVID-19', fontsize=20)
plt.tight_layout()
plt.show()
general_emotion['Emotion'].value_counts(ascending=False)
Emotion
Frustrated      130
Anxious          71
Stressed         61
Vulnerable       61
Disappointed     56
Lonely           52
Distant          50
Free             40
Optimistic       38
Peaceful         35
Name: count, dtype: int64
cc_filtered2 = general_emotion[(general_emotion['Job_sector'] != 'Other / I prefer not to answer')]
emotion_counts = pd.crosstab(cc_filtered2['Job_sector'], cc_filtered2['Emotion_Type'])
# Plot the grouped bar chart from the crosstab
# (DataFrame.plot creates its own figure, so a separate plt.figure call would leave an empty figure behind)
emotion_counts.plot(kind='bar', stacked=False, figsize=(14, 7))
plt.title('Count of Emotions Associated with COVID-19 by Job Sector')
plt.xlabel('Job Sector')
plt.ylabel('Count of Emotions')
plt.xticks(rotation=45, ha='right')
plt.legend(title='Emotions', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
emotion_counts = pd.crosstab(general_emotion['Age'], general_emotion['Emotion_Type'])
# Plot the grouped bar chart from the crosstab
emotion_counts.plot(kind='bar', stacked=False, figsize=(14, 7))
plt.title('Count of Emotions Associated with COVID-19 by Age')
plt.xlabel('Age')
plt.ylabel('Count of Emotions')
plt.xticks(rotation=0, ha='right')
plt.legend(title='Emotions', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()
Cleaning Data Set Tweets¶
tweets = pd.read_csv('data/covid19_tweets.csv')
tweets
user_name | user_location | user_description | user_created | user_followers | user_friends | user_favourites | user_verified | date | text | hashtags | source | is_retweet | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | ᏉᎥ☻լꂅϮ | astroworld | wednesday addams as a disney princess keepin i... | 2017-05-26 05:46:42 | 624 | 950 | 18775 | False | 2020-07-25 12:27:21 | If I smelled the scent of hand sanitizers toda... | NaN | Twitter for iPhone | False |
1 | Tom Basile 🇺🇸 | New York, NY | Husband, Father, Columnist & Commentator. Auth... | 2009-04-16 20:06:23 | 2253 | 1677 | 24 | True | 2020-07-25 12:27:17 | Hey @Yankees @YankeesPR and @MLB - wouldn't it... | NaN | Twitter for Android | False |
2 | Time4fisticuffs | Pewee Valley, KY | #Christian #Catholic #Conservative #Reagan #Re... | 2009-02-28 18:57:41 | 9275 | 9525 | 7254 | False | 2020-07-25 12:27:14 | @diane3443 @wdunlap @realDonaldTrump Trump nev... | ['COVID19'] | Twitter for Android | False |
3 | ethel mertz | Stuck in the Middle | #Browns #Indians #ClevelandProud #[]_[] #Cavs ... | 2019-03-07 01:45:06 | 197 | 987 | 1488 | False | 2020-07-25 12:27:10 | @brookbanktv The one gift #COVID19 has give me... | ['COVID19'] | Twitter for iPhone | False |
4 | DIPR-J&K | Jammu and Kashmir | 🖊️Official Twitter handle of Department of Inf... | 2017-02-12 06:45:15 | 101009 | 168 | 101 | False | 2020-07-25 12:27:08 | 25 July : Media Bulletin on Novel #CoronaVirus... | ['CoronaVirusUpdates', 'COVID19'] | Twitter for Android | False |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
179103 | AJIMATI AbdulRahman O. | Ilorin, Nigeria | Animal Scientist|| Muslim|| Real Madrid/Chelsea | 2013-12-30 18:59:19 | 412 | 1609 | 1062 | False | 2020-08-29 19:44:21 | Thanks @IamOhmai for nominating me for the @WH... | ['WearAMask'] | Twitter for Android | False |
179104 | Jason | Ontario | When your cat has more baking soda than Ninja ... | 2011-12-21 04:41:30 | 150 | 182 | 7295 | False | 2020-08-29 19:44:16 | 2020! The year of insanity! Lol! #COVID19 http... | ['COVID19'] | Twitter for Android | False |
179105 | BEEHEMOTH ⏳ | 🇨🇦 Canada | ⚒️ The Architects of Free Trade ⚒️ Really Did ... | 2016-07-13 17:21:59 | 1623 | 2160 | 98000 | False | 2020-08-29 19:44:15 | @CTVNews A powerful painting by Juan Lucena. I... | NaN | Twitter Web App | False |
179106 | Gary DelPonte | New York City | Global UX UI Visual Designer. StoryTeller, Mus... | 2009-10-27 17:43:13 | 1338 | 1111 | 0 | False | 2020-08-29 19:44:14 | More than 1,200 students test positive for #CO... | ['COVID19'] | Twitter for iPhone | False |
179107 | TUKY II | Aliwal North, South Africa | TOKELO SEKHOPA | TUKY II | LAST BORN | EISH TU... | 2018-04-14 17:30:07 | 97 | 1697 | 566 | False | 2020-08-29 19:44:08 | I stop when I see a Stop\n\n@SABCNews\n@Izinda... | NaN | Twitter for Android | False |
179108 rows × 13 columns
tweets1=pd.DataFrame(tweets[['text','user_location']])
tweets1
text | user_location | |
---|---|---|
0 | If I smelled the scent of hand sanitizers toda... | astroworld |
1 | Hey @Yankees @YankeesPR and @MLB - wouldn't it... | New York, NY |
2 | @diane3443 @wdunlap @realDonaldTrump Trump nev... | Pewee Valley, KY |
3 | @brookbanktv The one gift #COVID19 has give me... | Stuck in the Middle |
4 | 25 July : Media Bulletin on Novel #CoronaVirus... | Jammu and Kashmir |
... | ... | ... |
179103 | Thanks @IamOhmai for nominating me for the @WH... | Ilorin, Nigeria |
179104 | 2020! The year of insanity! Lol! #COVID19 http... | Ontario |
179105 | @CTVNews A powerful painting by Juan Lucena. I... | 🇨🇦 Canada |
179106 | More than 1,200 students test positive for #CO... | New York City |
179107 | I stop when I see a Stop\n\n@SABCNews\n@Izinda... | Aliwal North, South Africa |
179108 rows × 2 columns
The sentiment-analysis code below was drafted with help from GPT. First, install TextBlob.
from textblob import TextBlob
import re
# Function to clean the text data
def clean_text(text):
    text = re.sub(r'http\S+', '', text)   # Remove URLs
    text = re.sub(r'@\w+', '', text)      # Remove mentions
    text = re.sub(r'#', '', text)         # Remove hashtag symbol
    text = re.sub(r'RT[\s]+', '', text)   # Remove RT (retweet marker)
    text = re.sub(r'[^\w\s]', '', text)   # Remove punctuation
    text = text.lower()                   # Convert to lower case
    return text
# Filter the dataset for COVID-19 related texts
covid_related_terms = ['covid', 'covid19', 'coronavirus']
tweets1['text_cleaned'] = tweets1['text'].apply(clean_text)
# .copy() avoids SettingWithCopyWarning when the sentiment columns are added below
data_covid = tweets1[tweets1['text_cleaned'].str.contains('|'.join(covid_related_terms))].copy()
# Function to analyze sentiment with TextBlob
def analyze_sentiment(text):
    analysis = TextBlob(text)
    return analysis.sentiment.polarity, analysis.sentiment.subjectivity
# Applying sentiment analysis
data_covid['polarity'], data_covid['subjectivity'] = zip(*data_covid['text_cleaned'].apply(analyze_sentiment))
# Classify sentiments as positive, negative, or neutral
data_covid['sentiment'] = data_covid['polarity'].apply(lambda x: 'positive' if x > 0 else ('negative' if x < 0 else 'neutral'))
# Displaying the first few rows of the processed data
data_covid[['text', 'text_cleaned', 'polarity', 'subjectivity', 'sentiment','user_location']].head()
text | text_cleaned | polarity | subjectivity | sentiment | user_location | |
---|---|---|---|---|---|---|
2 | @diane3443 @wdunlap @realDonaldTrump Trump nev... | trump never once claimed covid19 was a hoax... | 0.00 | 0.000000 | neutral | Pewee Valley, KY |
3 | @brookbanktv The one gift #COVID19 has give me... | the one gift covid19 has give me is an apprec... | 0.00 | 0.357143 | neutral | Stuck in the Middle |
4 | 25 July : Media Bulletin on Novel #CoronaVirus... | 25 july media bulletin on novel coronavirusup... | 0.00 | 0.000000 | neutral | Jammu and Kashmir |
5 | #coronavirus #covid19 deaths continue to rise.... | coronavirus covid19 deaths continue to rise it... | -0.70 | 0.666667 | negative | Новоро́ссия |
6 | How #COVID19 Will Change Work in General (and ... | how covid19 will change work in general and re... | 0.05 | 0.500000 | positive | Gainesville, FL |
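As a quick self-contained check of the pipeline above (no TextBlob needed here: the polarity values fed to the classifier are made-up stand-ins, not real TextBlob scores):

```python
import re

def clean_text(text):
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)     # Remove mentions
    text = re.sub(r'#', '', text)        # Remove hashtag symbol
    text = re.sub(r'RT[\s]+', '', text)  # Remove RT (retweet marker)
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text.lower()

sample = "RT @user Check #COVID19 update! https://t.co/xyz"
print(clean_text(sample).strip())  # check covid19 update

# Same thresholding rule as in the notebook, applied to stand-in polarity values
label = lambda p: 'positive' if p > 0 else ('negative' if p < 0 else 'neutral')
print([label(p) for p in (0.7, -0.7, 0.0)])  # ['positive', 'negative', 'neutral']
```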
pd.set_option('display.max_colwidth', None)
data_covid.head(50)
text | user_location | text_cleaned | polarity | subjectivity | sentiment | |
---|---|---|---|---|---|---|
2 | @diane3443 @wdunlap @realDonaldTrump Trump never once claimed #COVID19 was a hoax. We all claim that this effort to… https://t.co/Jkk8vHWHb3 | Pewee Valley, KY | trump never once claimed covid19 was a hoax we all claim that this effort to | 0.000000 | 0.000000 | neutral |
3 | @brookbanktv The one gift #COVID19 has give me is an appreciation for the simple things that were always around me… https://t.co/Z0pOAlFXcW | Stuck in the Middle | the one gift covid19 has give me is an appreciation for the simple things that were always around me | 0.000000 | 0.357143 | neutral |
4 | 25 July : Media Bulletin on Novel #CoronaVirusUpdates #COVID19 \n@kansalrohit69 @DrSyedSehrish @airnewsalerts @ANI… https://t.co/MN0EEcsJHh | Jammu and Kashmir | 25 july media bulletin on novel coronavirusupdates covid19 \n | 0.000000 | 0.000000 | neutral |
5 | #coronavirus #covid19 deaths continue to rise. It's almost as bad as it ever was. Politicians and businesses want… https://t.co/hXMHooXX2C | Новоро́ссия | coronavirus covid19 deaths continue to rise its almost as bad as it ever was politicians and businesses want | -0.700000 | 0.666667 | negative |
6 | How #COVID19 Will Change Work in General (and recruiting, specifically) via/ @ProactiveTalent #Recruiting… https://t.co/bjZxzGPMbK | Gainesville, FL | how covid19 will change work in general and recruiting specifically via recruiting | 0.050000 | 0.500000 | positive |
8 | Praying for good health and recovery of @ChouhanShivraj .\n#covid19\n#covidPositive | NaN | praying for good health and recovery of \ncovid19\ncovidpositive | 0.700000 | 0.600000 | positive |
9 | POPE AS GOD - Prophet Sadhu Sundar Selvaraj. Watch here at https://t.co/7X5RTyKVji \n\n#HurricaneHanna #COVID19… https://t.co/06dZcGUnwL | 👇🏻location at link below👇🏻 | pope as god prophet sadhu sundar selvaraj watch here at \n\nhurricanehanna covid19 | 0.000000 | 0.000000 | neutral |
10 | 49K+ Covid19 cases still no response from \n@cbseindia29 @HRDMinistry @DrRPNishank.Please cancel the compartment exa… https://t.co/kV2ZKmumu1 | NaN | 49k covid19 cases still no response from \n please cancel the compartment exa | 0.000000 | 0.000000 | neutral |
12 | 👋🏻@PattyHajdu @NavdeepSBains — no one will be safe from #COVID19 until everyone is safe. Will you commit to ensure… https://t.co/aWCJo6eKvC | Hotel living - various cities! Who needs a home when hotel living is so fabulous! | no one will be safe from covid19 until everyone is safe will you commit to ensure | 0.500000 | 0.500000 | positive |
13 | Let's all protect ourselves from #COVID19.\nIt's real and the numbers are climbing up fast in the Continent.\nLet's n… https://t.co/byMSfxltrw | Africa | lets all protect ourselves from covid19\nits real and the numbers are climbing up fast in the continent\nlets n | 0.200000 | 0.450000 | positive |
14 | Rajasthan Government today started a Plasma Bank at Sawai Man Singh Hospital in Jaipur for treatment of COVID-19 pa… https://t.co/cwfCcWyaDA | New Delhi | rajasthan government today started a plasma bank at sawai man singh hospital in jaipur for treatment of covid19 pa | 0.000000 | 0.000000 | neutral |
15 | Nagaland police on Covid-19 Awareness at City Tower Junction Dimapur.\n\n#Covid19 #keepsocialdistance… https://t.co/WOvFMqhuEO | Nagaland, India | nagaland police on covid19 awareness at city tower junction dimapur\n\ncovid19 keepsocialdistance | 0.000000 | 0.000000 | neutral |
16 | July 25 #COVID19 update\n#TamilNadu - 6988\nDischarge- 7758\nPeople tested - 61729\nActice cases - 52273\n#chennai - 1329 | NaN | july 25 covid19 update\ntamilnadu 6988\ndischarge 7758\npeople tested 61729\nactice cases 52273\nchennai 1329 | 0.000000 | 0.000000 | neutral |
17 | Second wave of #COVID19 in Flanders..back to more #homework again... https://t.co/9swImZACMN | Brussels | second wave of covid19 in flandersback to more homework again | 0.250000 | 0.250000 | positive |
19 | COVID Update: The infection rate in Florida is following the natural curve that experts predicted if the initial cu… https://t.co/hmEwE747WK | Florida, USA | covid update the infection rate in florida is following the natural curve that experts predicted if the initial cu | 0.033333 | 0.166667 | positive |
21 | Coronavirus - South Africa: COVID-19 update for South Africa (24 July 2020) @nicd_sa @MoetiTshidi @WHOAFRO… https://t.co/WQzAdkpXC5 | #AFRICA #MENA | coronavirus south africa covid19 update for south africa 24 july 2020 | 0.000000 | 0.000000 | neutral |
23 | The first comprehensive review of #WASH & #COVID19.\n\nAnalysis: key ways in which WASH can help reduce transmission… https://t.co/eCsfDmo9b4 | 100+ countries | the first comprehensive review of wash amp covid19\n\nanalysis key ways in which wash can help reduce transmission | 0.125000 | 0.666667 | positive |
24 | Holy water in times of #COVID19 https://t.co/YaZ49yxL27 | Graz | holy water in times of covid19 | 0.000000 | 0.000000 | neutral |
25 | #Kolar\nNeed #Blood Type : B-positive\nAt : Jalappa Hospital\nBlood Component : Need Plasma from B+ve #COVID19 recove… https://t.co/XtgxULc02Z | Mumbai, India | kolar\nneed blood type bpositive\nat jalappa hospital\nblood component need plasma from bve covid19 recove | 0.000000 | 0.000000 | neutral |
27 | I can imagine the same people profiting off the human suffering of #COVID19 will be studying these maps to make 207… https://t.co/NElsmGju2H | Manhattan, NY | i can imagine the same people profiting off the human suffering of covid19 will be studying these maps to make 207 | 0.000000 | 0.112500 | neutral |
28 | #TNCoronaUpdate\n\n#TN crosses 2 lakh mark and 1,50,055 ppp has recovered so far.\n\n#COVID19 positive today 6988/total… https://t.co/QdJkpcc7qv | no e-pass to cross borders... | tncoronaupdate\n\ntn crosses 2 lakh mark and 150055 ppp has recovered so far\n\ncovid19 positive today 6988total | 0.163636 | 0.772727 | positive |
30 | #FEMA acknowledges #PuertoRico lacks rebuilt homes and a hospital to survive #COVID19 https://t.co/PLXqKc5K5d | United States | fema acknowledges puertorico lacks rebuilt homes and a hospital to survive covid19 | 0.000000 | 0.000000 | neutral |
31 | Actor @VishalKOfficial and his father Mr.G.K.Reddy tested positive for #Covid19 15/20 days back..\n\nWhile #Vishal 's… https://t.co/V5Lo8NRLWY | Chennai, India | actor and his father mrgkreddy tested positive for covid19 1520 days back\n\nwhile vishal s | 0.113636 | 0.272727 | positive |
32 | An update on the total #covid19 cases, recoveries, and deaths reported in countries in #Africa. See the total numb… https://t.co/eqCJ3EhwRi | Miami, FL | an update on the total covid19 cases recoveries and deaths reported in countries in africa see the total numb | -0.200000 | 0.833333 | negative |
33 | @unionwill @megawedgy @jjhorgan “Regarding the community outbreak of #COVID19 on Haida Gwaii: 13 cases (one recover… https://t.co/rJzuY60qY7 | British Columbia, Canada | regarding the community outbreak of covid19 on haida gwaii 13 cases one recover | 0.000000 | 0.000000 | neutral |
35 | Our CEO @RamneekHH live with @SASCO_Jikelele WSU talking about, “The effects of Covid-19 on students”.… https://t.co/VMN2LxHAds | South Africa | our ceo live with wsu talking about the effects of covid19 on students | 0.136364 | 0.500000 | positive |
36 | @NicoleKowalski5 @chrisdameanor19 I'm feeling a little anxious because #COVID19 seems to have detracted everyone's… https://t.co/A3mHdXamLg | Deep Heart Texas | im feeling a little anxious because covid19 seems to have detracted everyones | -0.218750 | 0.750000 | negative |
37 | Actionables for a healthy recovery from #COVID19 #climate https://t.co/crGlKZOz5Z | NaN | actionables for a healthy recovery from covid19 climate | 0.500000 | 0.500000 | positive |
39 | We released two new #COVID19 podcast episodes this week:\n-Technology Platforms Used to Conduct Telehealth Visits\n-C… https://t.co/G94VeYVCzk | Bethesda, Maryland | we released two new covid19 podcast episodes this week\ntechnology platforms used to conduct telehealth visits\nc | 0.136364 | 0.454545 | positive |
40 | #PlayYourPart in the fight against #COVID19 . Each one of us can make a difference in containing the spread, lets… https://t.co/fRzLgrS8Qf | Cavan, Ireland | playyourpart in the fight against covid19 each one of us can make a difference in containing the spread lets | 0.000000 | 0.000000 | neutral |
41 | Britain didn't understand #Covid19 for months, while plenty of other countries did understand it and took effective… https://t.co/KV2CAQ6skZ | A UK I no longer recognise | britain didnt understand covid19 for months while plenty of other countries did understand it and took effective | 0.237500 | 0.587500 | positive |
42 | Volume for those at the back please. 🔊 #COVID19 https://t.co/d0pn2Bv2Hx | Port Elizabeth, South Africa | volume for those at the back please covid19 | 0.000000 | 0.000000 | neutral |
43 | #Delhi reports 1,142 #COVID19 positive cases and 29 deaths on Saturday. Death toll rises to 3,806, reports ANI quot… https://t.co/JxsVpWMRQS | New Delhi, India | delhi reports 1142 covid19 positive cases and 29 deaths on saturday death toll rises to 3806 reports ani quot | 0.227273 | 0.545455 | positive |
45 | #TamilNadu | #COVID19 | 25th July\n\n● TN - 6,988 (Highest Spike)\n\n● Total Cases - 2,06,737\n\n● Chennai - 1,329… https://t.co/X8zgUzC0Gr | chennai | tamilnadu covid19 25th july\n\n tn 6988 highest spike\n\n total cases 206737\n\n chennai 1329 | 0.000000 | 0.750000 | neutral |
46 | Why has Ruto not eulogisied Mkapa!! \nAsking for Moses Kuria \n\nRaila \n#RutoSwindlingGEMA \n#COVID19 | Nairobi | why has ruto not eulogisied mkapa \nasking for moses kuria \n\nraila \nrutoswindlinggema \ncovid19 | 0.000000 | 0.000000 | neutral |
47 | Coronavirus Testing Fiasco: St Mirren have pledged to undertake an "urgent review" of their Covid-19 testing proced… https://t.co/0MCEUERQ74 | worldwide | coronavirus testing fiasco st mirren have pledged to undertake an urgent review of their covid19 testing proced | 0.000000 | 0.000000 | neutral |
48 | Coronavirus Testing Fiasco: St Mirren have pledged to undertake an "urgent review" of their Covid-19 testing proced… https://t.co/AkxvIaD1Pr | Scotland | coronavirus testing fiasco st mirren have pledged to undertake an urgent review of their covid19 testing proced | 0.000000 | 0.000000 | neutral |
49 | #TamilNadu | #COVID19 | 25th July\n● TN - 6,988 (Highest Spike)\n● Total Cases - 2,06,737\n● Chennai - 1,329\n● T… https://t.co/ZWFKJwZdEo | NaN | tamilnadu covid19 25th july\n tn 6988 highest spike\n total cases 206737\n chennai 1329\n t | 0.000000 | 0.750000 | neutral |
50 | Coronavirus Testing Fiasco: St Mirren have pledged to undertake an "urgent review" of their Covid-19 testing proced… https://t.co/Ifse0whbVE | Australia | coronavirus testing fiasco st mirren have pledged to undertake an urgent review of their covid19 testing proced | 0.000000 | 0.000000 | neutral |
51 | Coronavirus Testing Fiasco: St Mirren have pledged to undertake an "urgent review" of their Covid-19 testing proced… https://t.co/dx8Yt7CvsM | UK | coronavirus testing fiasco st mirren have pledged to undertake an urgent review of their covid19 testing proced | 0.000000 | 0.000000 | neutral |
52 | Coronavirus Testing Fiasco: St Mirren have pledged to undertake an "urgent review" of their Covid-19 testing proced… https://t.co/h7p44XIzuk | San Francisco | coronavirus testing fiasco st mirren have pledged to undertake an urgent review of their covid19 testing proced | 0.000000 | 0.000000 | neutral |
54 | Coronavirus Testing Fiasco: St Mirren have pledged to undertake an "urgent review" of their Covid-19 testing proced… https://t.co/bfel6gyXIq | USA | coronavirus testing fiasco st mirren have pledged to undertake an urgent review of their covid19 testing proced | 0.000000 | 0.000000 | neutral |
55 | Coronavirus Testing Fiasco: St Mirren have pledged to undertake an "urgent review" of their Covid-19 testing proced… https://t.co/eu5g2djZYx | Baton Rouge, USA | coronavirus testing fiasco st mirren have pledged to undertake an urgent review of their covid19 testing proced | 0.000000 | 0.000000 | neutral |
56 | .@nytimes reporting that yesterday 1145 people died of #COVID19 in the USA. Over 73K new cases were reported. Serio… https://t.co/sFwP0v7FEB | Queens, NY | reporting that yesterday 1145 people died of covid19 in the usa over 73k new cases were reported serio | 0.136364 | 0.454545 | positive |
57 | Crazy that the world has come to this but as Americans we will fight to get through this!🇺🇸🇺🇸🇺🇸🇺🇸🇺🇸 #covid19 | New Orleans, LA | crazy that the world has come to this but as americans we will fight to get through this covid19 | -0.600000 | 0.900000 | negative |
58 | @jimsciutto @JoAnnBaldwin55 People whose relatives have died from #COVID19 should file a class action lawsuit against Sinclair Broadcasting. | NaN | people whose relatives have died from covid19 should file a class action lawsuit against sinclair broadcasting | 0.100000 | 0.100000 | positive |
59 | US #CoronaVirus: 4,249,584 cases and 148,521 deaths reported in the United States to date.\n\n1,257 new cases and 31… https://t.co/j4TreMnKW5 | NaN | us coronavirus 4249584 cases and 148521 deaths reported in the united states to date\n\n1257 new cases and 31 | 0.136364 | 0.454545 | positive |
61 | I miss isopropyl alcohol so much!!!! Ethanol in hand sanitizer smells like I poured tequila on my hands 🤢 #COVID19 | astroworld | i miss isopropyl alcohol so much ethanol in hand sanitizer smells like i poured tequila on my hands covid19 | 0.200000 | 0.200000 | positive |
63 | @SonuSood sir mom is in ICU due to COVID just want prayers from you and everyone who is listening you . #COVID19 | NaN | sir mom is in icu due to covid just want prayers from you and everyone who is listening you covid19 | -0.125000 | 0.375000 | negative |
64 | Due to #COVID19 aka #Coronavirus Ill be sending the first 500 people to like & retweet this $300 -$1500 through Cas… https://t.co/3MkuUsGBDk | Marietta, GA | due to covid19 aka coronavirus ill be sending the first 500 people to like amp retweet this 300 1500 through cas | -0.125000 | 0.569444 | negative |
data_covid.info()
<class 'pandas.core.frame.DataFrame'>
Index: 113141 entries, 2 to 179106
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   text           113141 non-null  object 
 1   user_location  90423 non-null   object 
 2   text_cleaned   113141 non-null  object 
 3   polarity       113141 non-null  float64
 4   subjectivity   113141 non-null  float64
 5   sentiment      113141 non-null  object 
dtypes: float64(2), object(4)
memory usage: 6.0+ MB
# Counting tweets per sentiment label (sorted alphabetically)
data_covid['sentiment'].value_counts().sort_index()
sentiment
negative    18566
neutral     47800
positive    46775
Name: count, dtype: int64
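The sentiment labels in the table are consistent with a simple sign-of-polarity rule (the actual derivation of the column happens earlier in the notebook); a minimal sketch, where the `eps` neutrality threshold is an assumption:

```python
def label_sentiment(polarity, eps=0.0):
    """Map a TextBlob polarity score to a sentiment label.

    The table above suggests a plain sign rule: positive polarity ->
    'positive', negative -> 'negative', exactly zero -> 'neutral'.
    """
    if polarity > eps:
        return 'positive'
    if polarity < -eps:
        return 'negative'
    return 'neutral'
```

This matches the rows above, e.g. polarity 0.163636 is labeled positive and -0.2 negative.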
# Counting the number of each sentiment
sentiment_counts = data_covid['sentiment'].value_counts()
# Creating the bar plot
plt.figure(figsize=(10, 6))
sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values, palette="viridis")
plt.title('Distribution of Sentiments in COVID-19 Related Tweets')
plt.xlabel('Sentiment')
plt.ylabel('Number of Tweets')
plt.show()
Checking for the cities Milan and New York in the tweets dataset¶
data_covid['user_location'] = data_covid['user_location'].str.lower()
filtered_counts = data_covid[data_covid['user_location'] == 'milan, italy']['sentiment'].value_counts()
filtered_counts
sentiment
positive    4
neutral     3
Name: count, dtype: int64
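Exact equality only catches one spelling of a location; users write "new york, ny", "New York City", and so on. A more forgiving substring filter can be sketched as follows (the location strings in the sample frame are illustrative, not taken from the dataset):

```python
import pandas as pd

def count_sentiments_for_city(df, city):
    # Case-insensitive substring match on user_location; NaNs count as no match
    mask = df['user_location'].str.contains(city, case=False, na=False)
    return df.loc[mask, 'sentiment'].value_counts()

# Illustrative usage on a tiny frame
sample = pd.DataFrame({
    'user_location': ['new york, ny', 'New York City', 'milan, italy', None],
    'sentiment': ['positive', 'neutral', 'positive', 'negative'],
})
ny_counts = count_sentiments_for_city(sample, 'new york')
```

The same call with `'milan'` reproduces the equality-based filter above while also catching variants such as "Milano".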
from textblob import Blobber
from textblob.sentiments import NaiveBayesAnalyzer
import nltk
nltk.download('movie_reviews')
nltk.download('punkt')
# Initializing a Blobber so the NaiveBayesAnalyzer (trained once on the
# movie_reviews corpus) is reused across all tweets instead of rebuilt per call
tb = Blobber(analyzer=NaiveBayesAnalyzer())
# List of specific emotions for categorization
emotions = ['Free', 'Optimistic', 'Peaceful', 'Anxious', 'Disappointed', 'Frustrated', 'Distant', 'Lonely', 'Stressed', 'Vulnerable']
# Function to categorize text into specific emotions
# Note: This is a basic implementation. More complex NLP techniques might be needed for accurate categorization.
def categorize_emotions(text):
analysis = tb(text)
categorized_emotions = []
# Basic keyword matching for emotions
for emotion in emotions:
if emotion.lower() in text:
categorized_emotions.append(emotion)
# If no specific emotion is found, classify based on sentiment
if not categorized_emotions:
if analysis.sentiment.classification == 'pos':
return ['Optimistic']
elif analysis.sentiment.classification == 'neg':
return ['Anxious']
else:
return ['Neutral']
return categorized_emotions
# Applying the emotion categorization function to the dataset
data_covid['emotions'] = data_covid['text_cleaned'].apply(categorize_emotions)
# Displaying the first few rows of the dataset with categorized emotions
data_covid[['text', 'emotions']].head()
text | emotions | |
---|---|---|
2 | @diane3443 @wdunlap @realDonaldTrump Trump never once claimed #COVID19 was a hoax. We all claim that this effort to… https://t.co/Jkk8vHWHb3 | [Optimistic] |
3 | @brookbanktv The one gift #COVID19 has give me is an appreciation for the simple things that were always around me… https://t.co/Z0pOAlFXcW | [Optimistic] |
4 | 25 July : Media Bulletin on Novel #CoronaVirusUpdates #COVID19 \n@kansalrohit69 @DrSyedSehrish @airnewsalerts @ANI… https://t.co/MN0EEcsJHh | [Optimistic] |
5 | #coronavirus #covid19 deaths continue to rise. It's almost as bad as it ever was. Politicians and businesses want… https://t.co/hXMHooXX2C | [Optimistic] |
6 | How #COVID19 Will Change Work in General (and recruiting, specifically) via/ @ProactiveTalent #Recruiting… https://t.co/bjZxzGPMbK | [Optimistic] |
Note: the required corpora can also be installed with `python -m textblob.download_corpora`.
data_covid.emotions.value_counts()
emotions
[Optimistic]             76894
[Anxious]                34787
[Free]                    1112
[Vulnerable]               193
[Stressed]                  42
[Peaceful]                  36
[Disappointed]              25
[Distant]                   18
[Lonely]                    14
[Frustrated]                11
[Free, Peaceful]             4
[Free, Stressed]             1
[Lonely, Stressed]           1
[Anxious, Frustrated]        1
[Anxious, Stressed]          1
[Free, Lonely]               1
Name: count, dtype: int64
Another approach: expanding each emotion keyword with WordNet
from nltk.corpus import wordnet as wn
import nltk
# Download necessary NLTK data
nltk.download('wordnet')
# Function to get synonyms and related words for each emotion
def get_related_words(word):
synonyms = set()
for synset in wn.synsets(word):
for lemma in synset.lemmas():
synonyms.add(lemma.name()) # Add the synonyms
if lemma.antonyms(): # Optionally, add antonyms
synonyms.add(lemma.antonyms()[0].name())
return synonyms
# Extending the list of emotions with their synonyms and related words
extended_emotions = {}
for emotion in emotions:
extended_emotions[emotion] = get_related_words(emotion.lower())
# Function for emotion categorization with extended emotion words
def advanced_emotion_categorization(text):
found_emotions = []
for emotion, related_words in extended_emotions.items():
if any(word in text for word in related_words):
found_emotions.append(emotion)
return found_emotions if found_emotions else ['None']
# Applying the advanced emotion categorization function to the dataset
data_covid['advanced_emotions'] = data_covid['text_cleaned'].apply(advanced_emotion_categorization)
# Displaying the first few rows of the dataset with advanced categorized emotions
data_covid[['text', 'advanced_emotions']].head()
text | advanced_emotions | |
---|---|---|
2 | @diane3443 @wdunlap @realDonaldTrump Trump never once claimed #COVID19 was a hoax. We all claim that this effort to… https://t.co/Jkk8vHWHb3 | [None] |
3 | @brookbanktv The one gift #COVID19 has give me is an appreciation for the simple things that were always around me… https://t.co/Z0pOAlFXcW | [None] |
4 | 25 July : Media Bulletin on Novel #CoronaVirusUpdates #COVID19 \n@kansalrohit69 @DrSyedSehrish @airnewsalerts @ANI… https://t.co/MN0EEcsJHh | [None] |
5 | #coronavirus #covid19 deaths continue to rise. It's almost as bad as it ever was. Politicians and businesses want… https://t.co/hXMHooXX2C | [None] |
6 | How #COVID19 Will Change Work in General (and recruiting, specifically) via/ @ProactiveTalent #Recruiting… https://t.co/bjZxzGPMbK | [None] |
data_covid.advanced_emotions.value_counts()
advanced_emotions
[None]                                  100337
[Free]                                    4432
[Stressed]                                3128
[Frustrated]                              2371
[Distant]                                 1249
[Anxious]                                  457
[Lonely]                                   217
[Frustrated, Stressed]                     196
[Vulnerable]                               179
[Free, Stressed]                           113
[Free, Frustrated]                          84
[Disappointed, Frustrated]                  71
[Optimistic]                                50
[Peaceful]                                  36
[Free, Distant]                             35
[Distant, Stressed]                         30
[Disappointed]                              17
[Free, Anxious]                             17
[Frustrated, Distant]                       17
[Lonely, Stressed]                          15
[Anxious, Stressed]                         12
[Free, Lonely]                              10
[Anxious, Frustrated]                        9
[Free, Frustrated, Stressed]                 7
[Free, Peaceful]                             7
[Frustrated, Vulnerable]                     4
[Frustrated, Lonely]                         4
[Free, Vulnerable]                           4
[Stressed, Vulnerable]                       3
[Free, Distant, Stressed]                    3
[Anxious, Distant]                           3
[Free, Disappointed, Frustrated]             3
[Optimistic, Peaceful]                       2
[Anxious, Vulnerable]                        2
[Anxious, Lonely]                            2
[Frustrated, Distant, Stressed]              1
[Free, Optimistic, Frustrated]               1
[Anxious, Disappointed, Frustrated]          1
[Optimistic, Stressed]                       1
[Disappointed, Frustrated, Distant]          1
[Peaceful, Frustrated]                       1
[Free, Disappointed]                         1
[Disappointed, Frustrated, Stressed]         1
[Peaceful, Stressed]                         1
[Disappointed, Frustrated, Lonely]           1
[Optimistic, Frustrated]                     1
[Lonely, Vulnerable]                         1
[Free, Frustrated, Distant]                  1
[Free, Anxious, Lonely]                      1
[Anxious, Frustrated, Lonely]                1
Name: count, dtype: int64
Approach without WordNet: manually defined word variations
# Manually defining common variations for each emotion
emotion_variations = {
'Free': ['free', 'freedom', 'freely'],
'Optimistic': ['optimistic', 'optimism', 'optimistically'],
'Peaceful': ['peaceful', 'peace', 'peacefully'],
'Anxious': ['anxious', 'anxiety', 'anxiously'],
'Disappointed': ['disappointed', 'disappointing', 'disappointment'],
'Frustrated': ['frustrated', 'frustration', 'frustratingly'],
'Distant': ['distant', 'distance', 'distantly'],
'Lonely': ['lonely', 'loneliness', 'lone'],
'Stressed': ['stressed', 'stress', 'stressing'],
'Vulnerable': ['vulnerable', 'vulnerability', 'vulnerably']
}
# Function for simple emotion categorization based on defined variations
def simple_variation_emotion_categorization(text):
found_emotions = []
for emotion, variations in emotion_variations.items():
if any(variation in text for variation in variations):
found_emotions.append(emotion)
return found_emotions if found_emotions else ['None']
# Applying the function to the dataset
data_covid['variation_emotions'] = data_covid['text_cleaned'].apply(simple_variation_emotion_categorization)
# Displaying the first few rows of the dataset with categorized emotions
data_covid[['text', 'variation_emotions']].head()
text | variation_emotions | |
---|---|---|
2 | @diane3443 @wdunlap @realDonaldTrump Trump never once claimed #COVID19 was a hoax. We all claim that this effort to… https://t.co/Jkk8vHWHb3 | [None] |
3 | @brookbanktv The one gift #COVID19 has give me is an appreciation for the simple things that were always around me… https://t.co/Z0pOAlFXcW | [None] |
4 | 25 July : Media Bulletin on Novel #CoronaVirusUpdates #COVID19 \n@kansalrohit69 @DrSyedSehrish @airnewsalerts @ANI… https://t.co/MN0EEcsJHh | [None] |
5 | #coronavirus #covid19 deaths continue to rise. It's almost as bad as it ever was. Politicians and businesses want… https://t.co/hXMHooXX2C | [None] |
6 | How #COVID19 Will Change Work in General (and recruiting, specifically) via/ @ProactiveTalent #Recruiting… https://t.co/bjZxzGPMbK | [None] |
data_covid.variation_emotions.value_counts()
variation_emotions
[None]                             110530
[Free]                               1093
[Distant]                             314
[Stressed]                            268
[Lonely]                              236
[Vulnerable]                          207
[Peaceful]                            154
[Anxious]                             146
[Optimistic]                           69
[Disappointed]                         40
[Anxious, Stressed]                    25
[Frustrated]                           18
[Free, Stressed]                        7
[Free, Anxious]                         6
[Free, Peaceful]                        6
[Lonely, Stressed]                      5
[Free, Lonely]                          3
[Anxious, Frustrated]                   2
[Free, Distant]                         2
[Distant, Stressed]                     2
[Anxious, Stressed, Vulnerable]         1
[Free, Optimistic]                      1
[Frustrated, Stressed]                  1
[Peaceful, Stressed]                    1
[Frustrated, Lonely]                    1
[Lonely, Vulnerable]                    1
[Distant, Vulnerable]                   1
[Anxious, Lonely]                       1
Name: count, dtype: int64
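One caveat shared by all three approaches: `variation in text` is a substring test, so "free" also fires on "freeze" or "carefree". A token-based sketch avoids such false positives (the variation lists below are abbreviated to two emotions for illustration):

```python
import re

variations = {
    'Free': ['free', 'freedom', 'freely'],
    'Stressed': ['stressed', 'stress', 'stressing'],
}

def tokenized_emotion_categorization(text):
    # Split the cleaned text into whole-word tokens, so 'freeze'
    # no longer triggers the 'Free' category
    tokens = set(re.findall(r"[a-z]+", text.lower()))
    found = [emo for emo, words in variations.items() if tokens & set(words)]
    return found or ['None']
```

Swapping this matcher into `simple_variation_emotion_categorization` would likely lower the counts slightly, since substring hits inside longer words are discarded.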
# Collecting the single-emotion counts from the New York survey and from the
# three Twitter approaches above (values copied from the value_counts outputs)
general_ny = {
    'Frustrated' : 130,
    'Anxious' : 71,
    'Stressed' : 61,
    'Vulnerable' : 61,
    'Disappointed' : 56,
    'Lonely' : 52,
    'Distant' : 50,
    'Free' : 40,
    'Optimistic' : 38,
    'Peaceful' : 35
}
# First approach (keyword matching with NaiveBayes fallback):
# data_covid.emotions.value_counts()
basic_emotion_data = {
    'Optimistic': 76894,
    'Anxious': 34787,
    'Free': 1112,
    'Vulnerable': 193,
    'Stressed': 42,
    'Peaceful': 36,
    'Disappointed': 25,
    'Distant': 18,
    'Lonely': 14,
    'Frustrated': 11
}
# WordNet-expanded approach: data_covid.advanced_emotions.value_counts()
advanced_emotion_data = {
    'None': 100337,
    'Free': 4432,
    'Stressed': 3128,
    'Frustrated': 2371,
    'Distant': 1249,
    'Anxious': 457,
    'Lonely': 217,
    'Vulnerable': 179,
    'Optimistic': 50,
    'Peaceful': 36,
    'Disappointed': 17
}
# Manual-variation approach: data_covid.variation_emotions.value_counts()
variation_emotion_data = {
    'None': 110530,
    'Free': 1093,
    'Distant': 314,
    'Stressed': 268,
    'Lonely': 236,
    'Vulnerable': 207,
    'Peaceful': 154,
    'Anxious': 146,
    'Optimistic': 69,
    'Disappointed': 40,
    'Frustrated': 18
}
# For the comparison we keep only single emotions and exclude 'None'
single_emotions_advanced = {k: v for k, v in advanced_emotion_data.items() if ',' not in k and k != 'None'}
single_emotions_variation = {k: v for k, v in variation_emotion_data.items() if ',' not in k and k != 'None'}
# Creating a DataFrame for comparison
emotion_comparison_df = pd.DataFrame({
    'Basic': basic_emotion_data,
    'Advanced': single_emotions_advanced,
    'Variation': single_emotions_variation,
    'New York' : general_ny
})
# Creating a non-stacked bar chart with integer totals and custom color palette
plt.figure(figsize=(14, 7))
ax = emotion_comparison_df.plot(kind='bar', ax=plt.gca(), width=0.9)
plt.yscale('log') # Setting y-axis to logarithmic scale
# Adding integer total counts on top of each bar with slight vertical offset for better visibility
for p in ax.patches:
ax.annotate(f'{int(p.get_height())}', (p.get_x() * 1.005, p.get_height() * 1.05), fontsize=8)
plt.title('Comparison of Emotion Categorization Approaches')
plt.ylabel('Number of Occurrences (Log Scale)')
plt.xlabel('Emotions')
plt.xticks(rotation=45)
plt.legend(title='Approach')
plt.tight_layout() # Adjust the plot to ensure everything fits without overlapping
plt.show()
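Because the Twitter counts are orders of magnitude larger than the survey counts, raw numbers (even on a log scale) are hard to compare directly; normalizing each column to its own total gives comparable shares. A minimal sketch using two of the emotions above:

```python
import pandas as pd

# Two emotions from the manual-variation approach and the New York survey
counts = pd.DataFrame({
    'Variation': {'Free': 1093, 'Stressed': 268},
    'New York': {'Free': 40, 'Stressed': 61},
})
# Divide every column by its own total, giving within-source proportions
shares = counts.div(counts.sum(axis=0), axis=1)
```

On these two emotions the normalized view already flips the picture: 'Free' dominates the Twitter variation counts while 'Stressed' dominates the survey counts.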
Inspecting the advanced emotion results on a sample of tweets¶
# advanced_emotions is always populated (the function falls back to ['None']
# when nothing matches), so no NaN filter is needed
# Displaying the first 100 rows with all three emotion columns
data_covid.head(100)
text | user_location | text_cleaned | polarity | subjectivity | sentiment | emotions | advanced_emotions | variation_emotions | |
---|---|---|---|---|---|---|---|---|---|
2 | @diane3443 @wdunlap @realDonaldTrump Trump never once claimed #COVID19 was a hoax. We all claim that this effort to… https://t.co/Jkk8vHWHb3 | pewee valley, ky | trump never once claimed covid19 was a hoax we all claim that this effort to | 0.00 | 0.000000 | neutral | [Optimistic] | [None] | [None] |
3 | @brookbanktv The one gift #COVID19 has give me is an appreciation for the simple things that were always around me… https://t.co/Z0pOAlFXcW | stuck in the middle | the one gift covid19 has give me is an appreciation for the simple things that were always around me | 0.00 | 0.357143 | neutral | [Optimistic] | [None] | [None] |
4 | 25 July : Media Bulletin on Novel #CoronaVirusUpdates #COVID19 \n@kansalrohit69 @DrSyedSehrish @airnewsalerts @ANI… https://t.co/MN0EEcsJHh | jammu and kashmir | 25 july media bulletin on novel coronavirusupdates covid19 \n | 0.00 | 0.000000 | neutral | [Optimistic] | [None] | [None] |
5 | #coronavirus #covid19 deaths continue to rise. It's almost as bad as it ever was. Politicians and businesses want… https://t.co/hXMHooXX2C | новоро́ссия | coronavirus covid19 deaths continue to rise its almost as bad as it ever was politicians and businesses want | -0.70 | 0.666667 | negative | [Optimistic] | [None] | [None] |
6 | How #COVID19 Will Change Work in General (and recruiting, specifically) via/ @ProactiveTalent #Recruiting… https://t.co/bjZxzGPMbK | gainesville, fl | how covid19 will change work in general and recruiting specifically via recruiting | 0.05 | 0.500000 | positive | [Optimistic] | [None] | [None] |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
143 | Vietnam’s high level of preparedness to prevent Covid-19 spread: Deputy PM #Vietnam #covid19 #coronavirus #ncov… https://t.co/H2hJRzkEZl | hanoi, vietnam | vietnams high level of preparedness to prevent covid19 spread deputy pm vietnam covid19 coronavirus ncov | 0.16 | 0.540000 | positive | [Optimistic] | [None] | [None] |
145 | @CBCTheHouse @PattyHajdu @CochraneCBC #Covid19 Fatigue, people in front lines we cannot replace, that is #Fatigue!… https://t.co/8pk0Vik8gX | new brunswick, canada | covid19 fatigue people in front lines we cannot replace that is fatigue | 0.00 | 0.000000 | neutral | [Anxious] | [None] | [None] |
146 | We provided a North Devon tourist attraction with a digital #twowayradio solution even before #COVID19 struck. As w… https://t.co/kLwJHahlkq | exeter | we provided a north devon tourist attraction with a digital twowayradio solution even before covid19 struck as w | 0.00 | 0.000000 | neutral | [Optimistic] | [None] | [None] |
147 | Mask Print #1 (Londoners Wear Masks) https://t.co/N1zwBuf1No #MaskUp #mask #Masks #MaskMoaners #COVID19 #COVIDー19… https://t.co/W7uBNQomP2 | NaN | mask print 1 londoners wear masks maskup mask masks maskmoaners covid19 covidー19 | 0.00 | 0.000000 | neutral | [Anxious] | [None] | [None] |
148 | #RepublicansForBidenNow #COVID19 \nRepublican Group Uses Trump's Own Words Against Him In Searing Swing-State Ad… https://t.co/9fhivsvvHf | NaN | republicansforbidennow covid19 \nrepublican group uses trumps own words against him in searing swingstate ad | 0.60 | 1.000000 | positive | [Optimistic] | [None] | [None] |
100 rows × 9 columns