Plots for the ICRC2023 Diversity Session

Plots for the ICRC2023 Diversity Session#

  • Making plots for ICRC2023 Diversity Session

import altair as alt
import pandas as pd
import titanite as ti

# Print the version of each library used in this notebook.
print(f"Altair {alt.__version__}")
print(f"Pandas {pd.__version__}")
print(f"Titanite {ti.__version__}")
Altair 5.5.0
Pandas 2.2.3
Titanite 0.6.0

Read data

# Paths to the titanite configuration file and the prepared survey CSV.
f_cfg = "../../sandbox/config.toml"
f_csv = "../../data/test_data/prepared_data.csv"
# Build the titanite Data object from both paths, then fetch the config and the data.
d = ti.Data(read_from=f_csv, load_from=f_cfg)
config = d.config()
data = d.read()  # presumably a pandas DataFrame: it is filtered with .query() below
# data
2025-05-12 10:03:10.189 | INFO     | titanite.preprocess:categorical_data:135 - Categorize

q13 の不要な値を削除する

# Keep only the rows whose q13 value is non-negative.
q = "q13 >= 0"
q13_data = data.query(q)

Check the correlation between q13 and q14

def heatmap(data: pd.DataFrame, x: str, y: str):
    """Build three vertically stacked views of ``x`` against ``y``.

    All three views share the same y encoding, size and x domain; they differ
    only in how ``x`` is shown: raw points, bins of width 5, and bins of
    width 10. The binned views colour each cell by its row count.

    Args:
        data: Table holding the columns named by ``x`` and ``y``.
        x: Column placed on the horizontal axis (binned in two of the views,
            so presumably numeric).
        y: Column placed on the vertical axis.

    Returns:
        The vertical concatenation (``&``) of the point chart, the 5-wide
        binned chart and the 10-wide binned chart.
    """
    domain_x = alt.Scale(domain=[-10, 100])

    base = (
        alt.Chart(data)
        .encode(
            alt.Y(y),
        )
        .properties(
            width=400,
            height=400,
        )
    )

    def binned(step: int):
        # One rect-mark view with x binned at the requested width.
        return base.mark_rect().encode(
            alt.X(x).scale(domain_x).bin(step=step),
            alt.Color("count()").scale(scheme="blues"),
        )

    bin0 = base.mark_point().encode(alt.X(x).scale(domain_x))
    bin5 = binned(5)
    # The 10-wide view previously reused step=5 and so duplicated bin5.
    bin10 = binned(10)

    hm = bin0 & bin5 & bin10
    return hm
# q13 against q14 for every row with a valid q13 value.
heatmap(q13_data, x="q13", y="q14")
# Same view with the "Prefer not to answer" responses to q14 removed.
q = "q14 != 'Prefer not to answer'"
q14_data = q13_data.query(q)
heatmap(q14_data, x="q13", y="q14")
# q13 against q03_regional, without "Prefer not to answer".
q = "q03_regional != 'Prefer not to answer'"
q03_data = q13_data.query(q)
heatmap(q03_data, x="q13", y="q03_regional")
# q13 against q04_regional, without "Prefer not to answer".
# The plotted column must be the one that was filtered (was "q03_regional").
q = "q04_regional != 'Prefer not to answer'"
q04_data = q13_data.query(q)
heatmap(q04_data, x="q13", y="q04_regional")

q13の値(割合)でクラスター化する

  • 10 - 20%

  • 20 - 30%

  • 30 - 40%

def cluster_data(data: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``data`` with three extra ``*_clustered`` label columns.

    ``q13_clustered`` labels each row by the half-open range its ``q13`` value
    falls in. ``q03_clustered`` and ``q04_clustered`` keep the value of
    ``q03_regional`` / ``q04_regional`` when it is "Asia", "Europe" or
    "America". Every row that matches nothing is labelled "Others".
    The input frame itself is not modified.
    """
    result = data.copy()

    # (inclusive lower bound, exclusive upper bound, label)
    percent_ranges = (
        (0, 10, "0% to 10%"),
        (10, 20, "10% to 20%"),
        (20, 30, "20% to 30%"),
        (30, 40, "30% to 40%"),
        (40, 110, "40% and over"),
    )
    result["q13_clustered"] = "Others"
    for low, high, label in percent_ranges:
        in_range = (result["q13"] >= low) & (result["q13"] < high)
        result.loc[in_range, "q13_clustered"] = label

    # Regions that keep their own label; all others stay "Others".
    named_regions = ("Asia", "Europe", "America")
    column_pairs = (
        ("q03_regional", "q03_clustered"),
        ("q04_regional", "q04_clustered"),
    )
    for source, target in column_pairs:
        result[target] = "Others"
        for region in named_regions:
            result.loc[result[source] == region, target] = region

    return result
# Add the *_clustered label columns to each of the filtered data sets.
q13_data = cluster_data(q13_data)
q03_data = cluster_data(q03_data)
q04_data = cluster_data(q04_data)
# ti.core.hbar returns a pair: the first item is grouped with pandas below and
# the second is combined with "&", so presumably (grouped table, chart).
q13_grouped, h13 = ti.core.hbar(
    q13_data, x="q14", color="q13_clustered", title="q13-q14"
)
q03_grouped, h03 = ti.core.hbar(
    q03_data, x="q13", color="q03_clustered", title="q03-q13"
)
q04_grouped, h04 = ti.core.hbar(
    q04_data, x="q13", color="q04_clustered", title="q04-q13"
)
/home/runner/work/surveys/surveys/titanite/core.py:113: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped = data.groupby(g)[c].sum().reset_index()
h13 & h03 & h04

クラスターごとの回答数で規格化する

# For each grouped table: total the responses per cluster, join that total back
# onto every row of the cluster, and divide. After the join the per-row count
# is "response_x" and the per-cluster total is "response_y"; the normalized
# share is stored in "response".
q13_sum = q13_grouped.groupby("q13_clustered")["response"].sum().reset_index()
q13_merged = q13_grouped.merge(q13_sum, on="q13_clustered")
q13_merged["response"] = q13_merged["response_x"].div(q13_merged["response_y"])

q03_sum = q03_grouped.groupby("q03_clustered")["response"].sum().reset_index()
q03_merged = q03_grouped.merge(q03_sum, on="q03_clustered")
q03_merged["response"] = q03_merged["response_x"].div(q03_merged["response_y"])

q04_sum = q04_grouped.groupby("q04_clustered")["response"].sum().reset_index()
q04_merged = q04_grouped.merge(q04_sum, on="q04_clustered")
q04_merged["response"] = q04_merged["response_x"].div(q04_merged["response_y"])
# Horizontal-bar view of the normalized q13 table.
# NOTE(review): the title "hoge" looks like a placeholder — consider a descriptive one.
g, h = ti.core.hbar(q13_merged, x="q14", color="q13_clustered", title="hoge")
h
/home/runner/work/surveys/surveys/titanite/core.py:113: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped = data.groupby(g)[c].sum().reset_index()
# Horizontal-bar views of the normalized q03 / q04 tables, stacked vertically.
# NOTE(review): the title "hoge" looks like a placeholder — consider a descriptive one.
_, hbar03 = ti.core.hbar(q03_merged, x="q13", color="q03_clustered", title="hoge")
_, hbar04 = ti.core.hbar(q04_merged, x="q13", color="q04_clustered", title="hoge")
hbar03 & hbar04
# Heatmaps of q13 against the clustered region labels, placed side by side.
hm03 = heatmap(q03_data, x="q13", y="q03_clustered")
hm04 = heatmap(q04_data, x="q13", y="q04_clustered")
hm03 | hm04

Plots for presentation#

  • X軸-Y軸

  • q14-q13_clustered

  • q13-q03_clustered

# q13_merged
# Shared styling and titles for the Q14 presentation figure.
opacity = 0.7
# scheme = "category20b"
# scheme = "category20c"
scheme = "set1"
question = "【Q14】What do you think about the percentage of the female researcher in your group ?"
title_x = "What you think"
# NOTE(review): this filter is not used by q13_x below; it is reassigned to the
# same value before q13_n is built.
q = "q13_clustered != 'Others'"
# Left panel: raw response counts ("response_x") per q14 answer, coloured by q13 cluster.
q13_x = (
    alt.Chart(q13_merged)
    .mark_bar(opacity=opacity)
    .encode(
        alt.X("q14").title(title_x),
        alt.Y("response_x").title("responses"),
        alt.Color("q13_clustered").title("Percentages").scale(scheme=scheme),
    )
    .properties(
        width=300,
        height=600,
    )
)
# q13_x
# Right panel: per-cluster normalized responses, one row per q13 cluster,
# with the "Others" cluster filtered out.
q = "q13_clustered != 'Others'"
q13_n = (
    alt.Chart(q13_merged.query(q))
    .mark_bar(opacity=opacity)
    .encode(
        alt.X("q14").title("Answers"),
        # alt.Y("response").title("normalized"),
        alt.Y("response").title(None),
        alt.Color("q13_clustered").title("Percentages").scale(scheme=scheme),
        alt.Row("q13_clustered").title("Females in your group"),
    )
    .properties(
        width=300,
        height=100,
    )
)

# The result of this expression is not assigned; q13_n itself is what is
# combined into the final chart below.
q13_n.configure_axis(
    labelFontSize=15,
    titleFontSize=15,
).configure_header(labelFontSize=15)
# Final figure: both panels side by side under the question title, with
# enlarged title, axis and header fonts.
chart = (
    alt.hconcat(
        q13_x,
        q13_n,
    )
    .properties(
        title=question,
    )
    .configure_title(fontSize=20)
    .configure_axis(
        labelFontSize=15,
        titleFontSize=15,
    )
    .configure_header(labelFontSize=15)
)
# chart.save("../../data/main_data/q14-q13_clustered.png")
chart
  • The horizontal axis shows the respondent's opinion (Q14), and the color denotes the fraction of female researchers in their group.

  • Divided into 5 groups depending on the value range of the fractions.

  • < 20% (red & blue): the response tends to be on the dissatisfied side

  • > 30% (purple & orange): the response leans towards the satisfactory side

  • There seems to be a boundary between 20% and 30%.

q13-q03_clustered

# q03_merged
# Shared styling and titles for the Q13-by-workplace-region figure.
opacity = 0.7
scheme = "set1"
question = "【Q13】What is the percentage of female researcher in your group ?"
title_x = "Female researchers in your group [%]"

# Left panel: raw counts ("response_x") over q13 in bins of width 10,
# coloured by q03 cluster.
q03_x = (
    alt.Chart(q03_merged)
    .mark_bar(opacity=opacity)
    .encode(
        alt.X("q13").bin(step=10).title(title_x),
        alt.Y("response_x").title("entries"),
        alt.Color("q03_clustered").scale(scheme=scheme),
    )
    .properties(
        width=300,
        height=600,
    )
)
# q03_x
# NOTE(review): this filter is defined but never applied — q03_n charts the
# unfiltered q03_merged, whereas q13_n uses q13_merged.query(q). Confirm
# whether the "Others" row is meant to be shown.
q = "q03_clustered != 'Others'"
# Right panel: per-cluster normalized values, one row per q03 cluster.
q03_n = (
    alt.Chart(q03_merged)
    .mark_bar(opacity=opacity)
    .encode(
        alt.X("q13").bin(step=10).title(title_x),
        alt.Y("response").title("normalized"),
        alt.Color("q03_clustered:N").scale(scheme=scheme).title("Regions"),
        alt.Row("q03_clustered:N").title("Workplace Regions"),
    )
    .properties(
        width=300,
        height=100,
    )
)
# q03_n
# Final figure: both panels side by side under the question title.
chart = (q03_x | q03_n).properties(title=question).configure_title(fontSize=20)
# chart.save("../../data/main_data/q13-q03_clustered.png")
chart

q13-q04_clustered

# Shared styling and titles for the Q13-by-birthplace-region figure.
opacity = 0.7
scheme = "set1"
question = "【Q13】What is the percentage of female researcher in your group ?"
title_x = "Female researchers in your group [%]"

# Left panel: raw counts ("response_x") over q13 in bins of width 10,
# coloured by q04 cluster.
q04_x = (
    alt.Chart(q04_merged)
    .mark_bar(opacity=opacity)
    .encode(
        alt.X("q13").bin(step=10).title(title_x),
        alt.Y("response_x").title("entries"),
        alt.Color("q04_clustered").scale(scheme=scheme),
    )
    .properties(
        width=300,
        height=600,
    )
)
# q04_x
# NOTE(review): this filter is defined but never applied — q04_n charts the
# unfiltered q04_merged, whereas q13_n uses q13_merged.query(q). Confirm
# whether the "Others" row is meant to be shown.
q = "q04_clustered != 'Others'"
# Right panel: per-cluster normalized values, one row per q04 cluster.
q04_n = (
    alt.Chart(q04_merged)
    .mark_bar(opacity=opacity)
    .encode(
        alt.X("q13").bin(step=10).title(title_x),
        alt.Y("response").title("normalized"),
        alt.Color("q04_clustered:N").scale(scheme=scheme).title("Regions"),
        alt.Row("q04_clustered:N").title("Birthplace Regions"),
    )
    .properties(
        width=300,
        height=100,
    )
)
# q04_n
# Final figure: both panels side by side under the question title.
chart = (q04_x | q04_n).properties(title=question).configure_title(fontSize=20)
# chart.save("../../data/main_data/q13-q04_clustered.png")
chart
  • Examined the percentage of female researchers by region.

  • The disparities showed up.

  • The horizontal axis indicates the percentage of female researchers, while the color denotes the region.

  • Asia tends to exhibit a lower ratio of female researchers.