Plots for the ICRC2023 Diversity Session#
Making plots for ICRC2023 Diversity Session
import altair as alt
import pandas as pd
import titanite as ti
# Print one "<name> <version>" line for each of the three imported packages.
for label, module in (("Altair", alt), ("Pandas", pd), ("Titanite", ti)):
    print(f"{label} {module.__version__}")
Altair 5.5.0
Pandas 2.2.3
Titanite 0.6.0
Read data
# Paths to the configuration file and the prepared-data CSV.
f_cfg = "../../sandbox/config.toml"
f_csv = "../../data/test_data/prepared_data.csv"
# Build the titanite data handle from both paths.
d = ti.Data(read_from=f_csv, load_from=f_cfg)
config = d.config()
# `data` is used as a DataFrame by the cells below (it is filtered with `.query`).
data = d.read()
# data
2025-05-12 10:03:10.189 | INFO | titanite.preprocess:categorical_data:135 - Categorize
q13
の不要な値を削除する
# Keep only the rows whose q13 value is non-negative.
q = "q13 >= 0"
q13_data = data.query(q)
Check the correlation between q13
and q14
def heatmap(data: pd.DataFrame, x: str, y: str):
    """Build three vertically stacked views of ``x`` versus ``y``.

    From top to bottom: a point plot of the unbinned values, a heatmap
    with ``x`` binned in steps of 5, and a heatmap with ``x`` binned in
    steps of 10.  In the heatmaps the colour encodes the record count.

    Args:
        data: Table holding the columns named by ``x`` and ``y``.
        x: Column drawn on the horizontal axis, on a fixed [-10, 100] domain.
        y: Column drawn on the vertical axis.

    Returns:
        The vertical concatenation of the three charts.
    """
    domain_x = alt.Scale(domain=[-10, 100])
    base = (
        alt.Chart(data)
        .encode(
            alt.Y(y),
        )
        .properties(
            width=400,
            height=400,
        )
    )

    def binned(step: int):
        # One heatmap layer; only the bin width differs between the layers.
        return base.mark_rect().encode(
            alt.X(x).scale(domain_x).bin(step=step),
            alt.Color("count()").scale(scheme="blues"),
        )

    bin0 = base.mark_point().encode(alt.X(x).scale(domain_x))
    bin5 = binned(5)
    # The coarse view uses a bin width of 10 so that it differs from bin5.
    bin10 = binned(10)
    hm = bin0 & bin5 & bin10
    return hm
# q13 versus q14 for every row with a valid q13.
heatmap(q13_data, x="q13", y="q14")
# Same view without the "Prefer not to answer" rows of q14.
q = "q14 != 'Prefer not to answer'"
q14_data = q13_data.query(q)
heatmap(q14_data, x="q13", y="q14")
# q13 versus q03_regional, without "Prefer not to answer".
q = "q03_regional != 'Prefer not to answer'"
q03_data = q13_data.query(q)
heatmap(q03_data, x="q13", y="q03_regional")
# q13 versus q04_regional, without "Prefer not to answer".
q = "q04_regional != 'Prefer not to answer'"
q04_data = q13_data.query(q)
# q04_data is filtered on q04_regional, so that is the column to plot here.
heatmap(q04_data, x="q13", y="q04_regional")
q13
の値(割合)でクラスター化する
10 - 20%
20 - 30%
30 - 40%
# (inclusive lower bound, exclusive upper bound, label) for each q13 range.
_Q13_BINS = (
    (0, 10, "0% to 10%"),
    (10, 20, "10% to 20%"),
    (20, 30, "20% to 30%"),
    (30, 40, "30% to 40%"),
    (40, 110, "40% and over"),
)

# Region names that keep their own label; every other value becomes "Others".
_REGIONS = ("Asia", "Europe", "America")


def _label_ranges(frame: pd.DataFrame, source: str, target: str) -> None:
    """Write the _Q13_BINS label of each ``source`` value into ``target``."""
    frame[target] = "Others"
    for lower, upper, label in _Q13_BINS:
        in_range = frame[source].between(lower, upper, inclusive="left")
        frame.loc[in_range, target] = label


def _label_regions(frame: pd.DataFrame, source: str, target: str) -> None:
    """Copy the _REGIONS values of ``source`` into ``target``; others stay "Others"."""
    frame[target] = "Others"
    for region in _REGIONS:
        frame.loc[frame[source] == region, target] = region


def cluster_data(data: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``data`` with three added cluster-label columns.

    Added columns:
        q13_clustered: range label of ``q13`` ("0% to 10%", "10% to 20%",
            "20% to 30%", "30% to 40%", "40% and over"). Values below 0,
            values of 110 or more, and missing values get "Others".
        q03_clustered: ``q03_regional`` when it is "Asia", "Europe" or
            "America", otherwise "Others".
        q04_clustered: the same mapping applied to ``q04_regional``.

    Args:
        data: Table with the columns ``q13``, ``q03_regional`` and
            ``q04_regional``.

    Returns:
        A new DataFrame; ``data`` itself is not modified.
    """
    copied = data.copy()
    _label_ranges(copied, "q13", "q13_clustered")
    _label_regions(copied, "q03_regional", "q03_clustered")
    _label_regions(copied, "q04_regional", "q04_clustered")
    return copied
# Add the *_clustered label columns to each of the filtered tables.
q13_data = cluster_data(q13_data)
q03_data = cluster_data(q03_data)
q04_data = cluster_data(q04_data)
# Each ti.core.hbar call returns a pair: a grouped table (normalized in a
# later cell) and a chart (displayed in a later cell).
q13_grouped, h13 = ti.core.hbar(
    q13_data, x="q14", color="q13_clustered", title="q13-q14"
)
q03_grouped, h03 = ti.core.hbar(
    q03_data, x="q13", color="q03_clustered", title="q03-q13"
)
q04_grouped, h04 = ti.core.hbar(
    q04_data, x="q13", color="q04_clustered", title="q04-q13"
)
/home/runner/work/surveys/surveys/titanite/core.py:113: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
grouped = data.groupby(g)[c].sum().reset_index()
# Show the three bar charts stacked vertically.
h13 & h03 & h04
クラスターごとの回答数で規格化する
# Normalize by the number of responses in each cluster: compute the
# per-cluster total, merge it back (the totals land in "response_y"), and
# divide each row's count ("response_x") by its cluster total.
q13_sum = q13_grouped.groupby("q13_clustered")["response"].sum().reset_index()
q13_merged = q13_grouped.merge(q13_sum, on="q13_clustered")
q13_merged["response"] = q13_merged["response_x"].div(q13_merged["response_y"])
# q13_merged

q03_sum = q03_grouped.groupby("q03_clustered")["response"].sum().reset_index()
q03_merged = q03_grouped.merge(q03_sum, on="q03_clustered")
q03_merged["response"] = q03_merged["response_x"].div(q03_merged["response_y"])
# q03_merged

q04_sum = q04_grouped.groupby("q04_clustered")["response"].sum().reset_index()
q04_merged = q04_grouped.merge(q04_sum, on="q04_clustered")
q04_merged["response"] = q04_merged["response_x"].div(q04_merged["response_y"])
# q04_merged

# Bar chart of the normalized q13 table.
g, h = ti.core.hbar(
    q13_merged,
    x="q14",
    color="q13_clustered",
    title="hoge",
)
h
/home/runner/work/surveys/surveys/titanite/core.py:113: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
grouped = data.groupby(g)[c].sum().reset_index()
# Bar charts of the normalized region tables; the grouped tables returned
# alongside the charts are discarded.
_, hbar03 = ti.core.hbar(q03_merged, x="q13", color="q03_clustered", title="hoge")
_, hbar04 = ti.core.hbar(q04_merged, x="q13", color="q04_clustered", title="hoge")
hbar03 & hbar04
# Heatmaps of q13 against the clustered region columns, placed side by side.
hm03 = heatmap(q03_data, x="q13", y="q03_clustered")
hm04 = heatmap(q04_data, x="q13", y="q04_clustered")
hm03 | hm04
Plots for presentation#
X軸-Y軸
q14-q13_clustered
q13-q03_clustered
# q13_merged
# Styling and titles shared by both panels of the Q14 figure.
opacity = 0.7
# scheme = "category20b"
# scheme = "category20c"
scheme = "set1"
question = "【Q14】What do you think about the percentage of the female researcher in your group ?"
title_x = "What you think"
q = "q13_clustered != 'Others'"

# Left panel: response counts per answer, coloured by q13 cluster.
q13_x = (
    alt.Chart(q13_merged)
    .mark_bar(opacity=opacity)
    .encode(
        x=alt.X("q14", title=title_x),
        y=alt.Y("response_x", title="responses"),
        color=alt.Color(
            "q13_clustered",
            title="Percentages",
            scale=alt.Scale(scheme=scheme),
        ),
    )
    .properties(width=300, height=600)
)
# q13_x

# Right panel: normalized responses, one row per q13 cluster, with the
# "Others" cluster removed.
q = "q13_clustered != 'Others'"
q13_n = (
    alt.Chart(q13_merged.query(q))
    .mark_bar(opacity=opacity)
    .encode(
        x=alt.X("q14", title="Answers"),
        # y=alt.Y("response", title="normalized"),
        y=alt.Y("response", title=None),
        color=alt.Color(
            "q13_clustered",
            title="Percentages",
            scale=alt.Scale(scheme=scheme),
        ),
        row=alt.Row("q13_clustered", title="Females in your group"),
    )
    .properties(width=300, height=100)
)
q13_n.configure_axis(labelFontSize=15, titleFontSize=15).configure_header(
    labelFontSize=15
)

# Put both panels side by side under the question text.
chart = (
    (q13_x | q13_n)
    .properties(title=question)
    .configure_title(fontSize=20)
    .configure_axis(labelFontSize=15, titleFontSize=15)
    .configure_header(labelFontSize=15)
)
# chart.save("../../data/main_data/q14-q13_clustered.png")
chart
The horizontal axis shows the answers (“what you think”), and the color denotes the fraction of female researchers in the respondent's group.
The responses are divided into 5 groups according to the value range of that fraction.
< 20% (red & blue): the responses tend to be on the dissatisfied side.
> 30% (purple & orange): the responses lean towards the satisfied side.
There seems to be a boundary between 20% and 30%.
q13-q03_clustered
# q03_merged
# Styling and titles shared by both panels of the Q13 / workplace-region figure.
opacity = 0.7
scheme = "set1"
question = "【Q13】What is the percentage of female researcher in your group ?"
title_x = "Female researchers in your group [%]"

# Left panel: entries per 10-wide q13 bin, coloured by q03 cluster.
q03_x = (
    alt.Chart(q03_merged)
    .mark_bar(opacity=opacity)
    .encode(
        alt.X("q13").bin(step=10).title(title_x),
        alt.Y("response_x").title("entries"),
        alt.Color("q03_clustered").scale(scheme=scheme),
    )
    .properties(
        width=300,
        height=600,
    )
)
# q03_x

# Right panel: normalized responses, one row per q03 cluster. The filter is
# applied so the "Others" cluster is left out, matching the per-cluster
# panel of the Q14 figure.
q = "q03_clustered != 'Others'"
q03_n = (
    alt.Chart(q03_merged.query(q))
    .mark_bar(opacity=opacity)
    .encode(
        alt.X("q13").bin(step=10).title(title_x),
        alt.Y("response").title("normalized"),
        alt.Color("q03_clustered:N").scale(scheme=scheme).title("Regions"),
        alt.Row("q03_clustered:N").title("Workplace Regions"),
    )
    .properties(
        width=300,
        height=100,
    )
)
# q03_n

# Put both panels side by side under the question text.
chart = (q03_x | q03_n).properties(title=question).configure_title(fontSize=20)
# chart.save("../../data/main_data/q13-q03_clustered.png")
chart
q13-q04_clustered
# Styling and titles shared by both panels of the Q13 / birthplace-region figure.
opacity = 0.7
scheme = "set1"
question = "【Q13】What is the percentage of female researcher in your group ?"
title_x = "Female researchers in your group [%]"

# Left panel: entries per 10-wide q13 bin, coloured by q04 cluster.
q04_x = (
    alt.Chart(q04_merged)
    .mark_bar(opacity=opacity)
    .encode(
        alt.X("q13").bin(step=10).title(title_x),
        alt.Y("response_x").title("entries"),
        alt.Color("q04_clustered").scale(scheme=scheme),
    )
    .properties(
        width=300,
        height=600,
    )
)
# q04_x

# Right panel: normalized responses, one row per q04 cluster. The filter is
# applied so the "Others" cluster is left out, matching the per-cluster
# panel of the Q14 figure.
q = "q04_clustered != 'Others'"
q04_n = (
    alt.Chart(q04_merged.query(q))
    .mark_bar(opacity=opacity)
    .encode(
        alt.X("q13").bin(step=10).title(title_x),
        alt.Y("response").title("normalized"),
        alt.Color("q04_clustered:N").scale(scheme=scheme).title("Regions"),
        alt.Row("q04_clustered:N").title("Birthplace Regions"),
    )
    .properties(
        width=300,
        height=100,
    )
)
# q04_n

# Put both panels side by side under the question text.
chart = (q04_x | q04_n).properties(title=question).configure_title(fontSize=20)
# chart.save("../../data/main_data/q13-q04_clustered.png")
chart
We examined the percentage of female researchers by region.
Disparities between the regions showed up.
The horizontal axis indicates the percentage of female researchers, while the color denotes the region.
Asia tends to exhibit a lower ratio of female researchers.