Plots for the ICRC2023 Diversity Session

Making plots for ICRC2023 Diversity Session

import altair as alt
import pandas as pd
import titanite as ti

print(f"Altair {alt.__version__}")
print(f"Pandas {pd.__version__}")
print(f"Titanite {ti.__version__}")

Altair 5.5.0
Pandas 2.3.0
Titanite 0.6.0

Read data

# Relative paths: they are resolved against the current working directory.
f_cfg = "../../sandbox/config.toml"
f_csv = "../../data/test_data/prepared_data.csv"
# NOTE(review): presumably ti.Data reads the CSV and applies the TOML config
# to it; confirm against the titanite documentation.
d = ti.Data(read_from=f_csv, load_from=f_cfg)
config = d.config()
data = d.read()
# data

2025-06-23 09:22:50.337 | INFO     | titanite.preprocess:categorical_data:135 - Categorize

q13 の不要な値を削除する

# Keep only the rows whose q13 is non-negative; negative q13 values are the
# unwanted entries being dropped in this step.
q = "q13 >= 0"
q13_data = data.query(q)

Check the correlation between q13 and q14

def heatmap(data: pd.DataFrame, x: str, y: str):
    """Draw ``y`` against ``x`` three times: unbinned, 5-wide bins, 10-wide bins.

    Args:
        data: Table containing both columns.
        x: Name of the column on the horizontal axis. Its scale domain is
            fixed to ``[-10, 100]`` in all three panels.
        y: Name of the column on the vertical axis.

    Returns:
        The three 400x400 panels stacked with Altair's ``&`` operator, in the
        order points, step-5 heatmap, step-10 heatmap.
    """
    domain_x = alt.Scale(domain=[-10, 100])

    base = (
        alt.Chart(data)
        .encode(
            alt.Y(y),
        )
        .properties(
            width=400,
            height=400,
        )
    )

    def binned_rect(step: int):
        # Rectangles coloured by the number of rows that fall in each cell.
        return base.mark_rect().encode(
            alt.X(x).scale(domain_x).bin(step=step),
            alt.Color("count()").scale(scheme="blues"),
        )

    bin0 = base.mark_point().encode(alt.X(x).scale(domain_x))
    bin5 = binned_rect(5)
    # The original used step=5 here as well, which made this panel a
    # duplicate of bin5 instead of the coarser view its name promises.
    bin10 = binned_rect(10)

    return bin0 & bin5 & bin10

# q14 against q13 for every row of q13_data.
heatmap(q13_data, x="q13", y="q14")

# The same plot after removing the rows where q14 is 'Prefer not to answer'.
q = "q14 != 'Prefer not to answer'"
q14_data = q13_data.query(q)
heatmap(q14_data, x="q13", y="q14")

# q03_regional against q13, without the 'Prefer not to answer' rows of q03_regional.
q = "q03_regional != 'Prefer not to answer'"
q03_data = q13_data.query(q)
heatmap(q03_data, x="q13", y="q03_regional")

# q04_regional against q13, without the 'Prefer not to answer' rows of
# q04_regional.  The y column must be q04_regional: the original plotted
# q03_regional here, so the q04 filter was applied but q04 was never shown.
q = "q04_regional != 'Prefer not to answer'"
q04_data = q13_data.query(q)
heatmap(q04_data, x="q13", y="q04_regional")

q13の値（割合）でクラスター化する

0 - 10%
10 - 20%
20 - 30%
30 - 40%
40% and over

def cluster_data(data: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``data`` with three added label columns.

    * ``q13_clustered`` labels ``q13`` by half-open range: ``[0, 10)``,
      ``[10, 20)``, ``[20, 30)``, ``[30, 40)`` and ``[40, 110)``.
    * ``q03_clustered`` / ``q04_clustered`` copy ``q03_regional`` /
      ``q04_regional`` when the value is Asia, Europe or America.

    Every row that matches none of the above gets the label ``"Others"``.
    The input frame is not modified.
    """
    result = data.copy()

    # (inclusive lower bound, exclusive upper bound, label)
    q13_ranges = (
        (0, 10, "0% to 10%"),
        (10, 20, "10% to 20%"),
        (20, 30, "20% to 30%"),
        (30, 40, "30% to 40%"),
        (40, 110, "40% and over"),
    )
    result["q13_clustered"] = "Others"
    for low, high, label in q13_ranges:
        in_range = (result["q13"] >= low) & (result["q13"] < high)
        result.loc[in_range, "q13_clustered"] = label

    # Regions that keep their own name; anything else stays "Others".
    named_regions = ("Asia", "Europe", "America")
    region_columns = (
        ("q03_regional", "q03_clustered"),
        ("q04_regional", "q04_clustered"),
    )
    for source, target in region_columns:
        result[target] = "Others"
        for region in named_regions:
            result.loc[result[source] == region, target] = region

    return result

# Add the *_clustered label columns to each of the filtered tables.
q13_data = cluster_data(q13_data)
q03_data = cluster_data(q03_data)
q04_data = cluster_data(q04_data)

# Each hbar call returns a pair, unpacked as (grouped table, chart).
# NOTE(review): how titanite aggregates inside hbar is not visible here;
# the code below only relies on the grouped table having a "response" column.
q13_grouped, h13 = ti.core.hbar(
    q13_data, x="q14", color="q13_clustered", title="q13-q14"
)
q03_grouped, h03 = ti.core.hbar(
    q03_data, x="q13", color="q03_clustered", title="q03-q13"
)
q04_grouped, h04 = ti.core.hbar(
    q04_data, x="q13", color="q04_clustered", title="q04-q13"
)

/home/runner/work/surveys/surveys/titanite/core.py:113: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped = data.groupby(g)[c].sum().reset_index()

h13 & h03 & h04

クラスターごとの回答数で規格化する

# Step 1: total "response" per cluster, one small table per question.
q13_sum = q13_grouped.groupby("q13_clustered")["response"].sum().reset_index()
q03_sum = q03_grouped.groupby("q03_clustered")["response"].sum().reset_index()
q04_sum = q04_grouped.groupby("q04_clustered")["response"].sum().reset_index()

# Step 2: attach each cluster total to its rows.  Both sides carry a
# "response" column, so the merge names them "response_x" (per-row value)
# and "response_y" (cluster total).
q13_merged = q13_grouped.merge(q13_sum, on="q13_clustered")
q03_merged = q03_grouped.merge(q03_sum, on="q03_clustered")
q04_merged = q04_grouped.merge(q04_sum, on="q04_clustered")

# Step 3: the normalised value is the per-row share of the cluster total.
q13_merged["response"] = q13_merged["response_x"] / q13_merged["response_y"]
q03_merged["response"] = q03_merged["response_x"] / q03_merged["response_y"]
q04_merged["response"] = q04_merged["response_x"] / q04_merged["response_y"]

# Quick look at the normalised q13 table.
# NOTE(review): title="hoge" looks like a placeholder; replace it before sharing.
g, h = ti.core.hbar(q13_merged, x="q14", color="q13_clustered", title="hoge")
h

/home/runner/work/surveys/surveys/titanite/core.py:113: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  grouped = data.groupby(g)[c].sum().reset_index()

# Quick look at the normalised q03 / q04 tables; only the chart of each pair is kept.
# NOTE(review): title="hoge" looks like a placeholder; replace it before sharing.
_, hbar03 = ti.core.hbar(q03_merged, x="q13", color="q03_clustered", title="hoge")
_, hbar04 = ti.core.hbar(q04_merged, x="q13", color="q04_clustered", title="hoge")
hbar03 & hbar04

# Heatmaps of the clustered region labels against q13, q03 next to q04.
hm03 = heatmap(q03_data, x="q13", y="q03_clustered")
hm04 = heatmap(q04_data, x="q13", y="q04_clustered")
hm03 | hm04

Plots for presentation

X軸-Y軸
q14-q13_clustered
q13-q03_clustered

# q13_merged

# Shared styling for the Q14 presentation figure.
opacity = 0.7
# scheme = "category20b"
# scheme = "category20c"
scheme = "set1"
question = "【Q14】What do you think about the percentage of the female researcher in your group ?"
title_x = "What you think"
# NOTE(review): q is assigned here but not used by q13_x below, which plots
# the unfiltered q13_merged.
q = "q13_clustered != 'Others'"
# Left panel: un-normalised counts ("response_x") per q14 answer, coloured by
# q13 cluster.
q13_x = (
    alt.Chart(q13_merged)
    .mark_bar(opacity=opacity)
    .encode(
        alt.X("q14").title(title_x),
        alt.Y("response_x").title("responses"),
        alt.Color("q13_clustered").title("Percentages").scale(scheme=scheme),
    )
    .properties(
        width=300,
        height=600,
    )
)
# q13_x

# Right panel: normalised "response" per q14 answer, one row per q13 cluster,
# with the "Others" cluster removed.
q = "q13_clustered != 'Others'"
q13_n = (
    alt.Chart(q13_merged.query(q))
    .mark_bar(opacity=opacity)
    .encode(
        alt.X("q14").title("Answers"),
        # alt.Y("response").title("normalized"),
        alt.Y("response").title(None),
        alt.Color("q13_clustered").title("Percentages").scale(scheme=scheme),
        alt.Row("q13_clustered").title("Females in your group"),
    )
    .properties(
        width=300,
        height=100,
    )
)

# NOTE(review): the result of this expression is not assigned to anything, so
# q13_n itself is unchanged; unless this is the last statement of its notebook
# cell it has no visible effect.  The combined chart below applies the same
# font sizes itself.
q13_n.configure_axis(
    labelFontSize=15,
    titleFontSize=15,
).configure_header(labelFontSize=15)

# Final figure: both panels side by side under the question text, with the
# font sizes configured once on the combined chart.
chart = (
    alt.hconcat(
        q13_x,
        q13_n,
    )
    .properties(
        title=question,
    )
    .configure_title(fontSize=20)
    .configure_axis(
        labelFontSize=15,
        titleFontSize=15,
    )
    .configure_header(labelFontSize=15)
)
# chart.save("../../data/main_data/q14-q13_clustered.png")
chart

The horizontal axis shows the respondents' answers to Q14 (what they think about the percentage), and the color denotes the fraction of female researchers in their group.
The responses are divided into 5 groups according to the range of that fraction.
< 20% (red & blue): the responses tend toward the dissatisfied side
> 30% (purple & orange): the responses lean toward the satisfied side
There seems to be a boundary between 20% and 30%.

q13-q03_clustered

# q03_merged

# Shared styling for the Q13-by-q03-region figure.
opacity = 0.7
scheme = "set1"
question = "【Q13】What is the percentage of female researcher in your group ?"
title_x = "Female researchers in your group [%]"

# Left panel: un-normalised counts ("response_x") over q13 in bins of 10,
# coloured by q03 cluster.
q03_x = (
    alt.Chart(q03_merged)
    .mark_bar(opacity=opacity)
    .encode(
        alt.X("q13").bin(step=10).title(title_x),
        alt.Y("response_x").title("entries"),
        alt.Color("q03_clustered").scale(scheme=scheme),
    )
    .properties(
        width=300,
        height=600,
    )
)
# q03_x

# NOTE(review): q is assigned but never applied; q03_n plots the unfiltered
# q03_merged, so the "Others" row is included (the q13 panel uses
# q13_merged.query(q) instead).  Confirm which behaviour is intended.
q = "q03_clustered != 'Others'"
# Right panel: normalised "response" over the same bins, one row per q03 cluster.
q03_n = (
    alt.Chart(q03_merged)
    .mark_bar(opacity=opacity)
    .encode(
        alt.X("q13").bin(step=10).title(title_x),
        alt.Y("response").title("normalized"),
        alt.Color("q03_clustered:N").scale(scheme=scheme).title("Regions"),
        alt.Row("q03_clustered:N").title("Workplace Regions"),
    )
    .properties(
        width=300,
        height=100,
    )
)
# q03_n

# Final figure: both panels side by side under the question text.
chart = (q03_x | q03_n).properties(title=question).configure_title(fontSize=20)
# chart.save("../../data/main_data/q13-q03_clustered.png")
chart

q13-q04_clustered

# Shared styling for the Q13-by-q04-region figure.
opacity = 0.7
scheme = "set1"
question = "【Q13】What is the percentage of female researcher in your group ?"
title_x = "Female researchers in your group [%]"

# Left panel: un-normalised counts ("response_x") over q13 in bins of 10,
# coloured by q04 cluster.
q04_x = (
    alt.Chart(q04_merged)
    .mark_bar(opacity=opacity)
    .encode(
        alt.X("q13").bin(step=10).title(title_x),
        alt.Y("response_x").title("entries"),
        alt.Color("q04_clustered").scale(scheme=scheme),
    )
    .properties(
        width=300,
        height=600,
    )
)
# q04_x

# NOTE(review): q is assigned but never applied; q04_n plots the unfiltered
# q04_merged, so the "Others" row is included (the q13 panel uses
# q13_merged.query(q) instead).  Confirm which behaviour is intended.
q = "q04_clustered != 'Others'"
# Right panel: normalised "response" over the same bins, one row per q04 cluster.
q04_n = (
    alt.Chart(q04_merged)
    .mark_bar(opacity=opacity)
    .encode(
        alt.X("q13").bin(step=10).title(title_x),
        alt.Y("response").title("normalized"),
        alt.Color("q04_clustered:N").scale(scheme=scheme).title("Regions"),
        alt.Row("q04_clustered:N").title("Birthplace Regions"),
    )
    .properties(
        width=300,
        height=100,
    )
)
# q04_n

# Final figure: both panels side by side under the question text.
chart = (q04_x | q04_n).properties(title=question).configure_title(fontSize=20)
# chart.save("../../data/main_data/q13-q04_clustered.png")
chart

We examined the percentage of female researchers by region.
Disparities between the regions are visible.
The horizontal axis indicates the percentage of female researchers, while the color denotes the region.
Asia tends to show a lower ratio of female researchers.

Plots for the ICRC2023 Diversity Session

Contents

Plots for the ICRC2023 Diversity Session#

Plots for presentation#