-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathapp.py
More file actions
147 lines (115 loc) · 6.7 KB
/
app.py
File metadata and controls
147 lines (115 loc) · 6.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import streamlit as st
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import math
def calculate_relevance_scores(df, column1, column2):
    """Return the pairwise cosine-similarity matrix between rows of *df*.

    Each row's text is the concatenation of *column1* and *column2*
    (NaNs treated as empty strings), vectorised with TF-IDF and then
    compared against every other row.
    """
    texts = (df[column1].fillna('') + ' ' + df[column2].fillna('')).astype(str)
    tfidf = TfidfVectorizer().fit_transform(texts)
    return cosine_similarity(tfidf, tfidf)
def calculate_minimum_repeat_limit(df, link_count):
    """Return the smallest per-URL repeat limit that can satisfy *link_count*
    links on every row.

    Fix: guard against an empty DataFrame, which previously raised
    ZeroDivisionError (``len(df) == 0`` in the denominator).

    Parameters:
        df: the uploaded sheet (only its length is used).
        link_count: desired number of internal links per page.

    Returns:
        int >= 1 — the minimum repeat limit.
    """
    total_unique_links = len(df)
    if total_unique_links == 0:
        # No rows means no links are needed; 1 is the slider's floor.
        return 1
    total_links_needed = total_unique_links * link_count
    return max(1, math.ceil(total_links_needed / total_unique_links))
def ensure_no_row_without_links(df, link_usage, repeat_limit, link_count,
                                url_col='Full URL', keyword_col='Target Keyword'):
    """Backfill links for any row whose 'Link 1 URL' slot is still empty.

    Fixes:
      * The UI initialises link slots to "" (empty string), but this function
        previously tested emptiness with ``pd.isnull`` only, making it a no-op
        in the real flow. Empty strings now also count as empty.
      * The slot scan previously read the stale ``row`` snapshot from
        ``iterrows()``, so writes via ``df.at`` were never seen and slot 1
        could be overwritten repeatedly when ``link_count > 1``. The live
        DataFrame is now consulted instead.

    Parameters:
        df: sheet with 'Link {i} URL' / 'Link {i} Anchor Text' columns.
        link_usage: mutable dict mapping URL -> times used; updated in place.
        repeat_limit: preferred cap on per-URL reuse.
        link_count: number of link slots per row.
        url_col / keyword_col: column names (defaults preserve the original
            hard-coded behaviour, so existing callers are unaffected).
    """
    def _empty(value):
        # Slots may be NaN (never written) or "" (UI initialisation).
        return pd.isnull(value) or value == ''

    all_urls = df[url_col].tolist()
    for idx, _ in df.iterrows():
        if _empty(df.at[idx, 'Link 1 URL']):  # row received no links in the main pass
            links_added = 0
            for url in all_urls:
                if url == df.at[idx, url_col]:
                    continue  # never link a page to itself
                # NOTE(review): `links_added < link_count` makes the repeat
                # limit non-binding until the row is full — preserved as-is.
                if link_usage[url] < repeat_limit or links_added < link_count:
                    for i in range(link_count):
                        slot = f'Link {i+1} URL'
                        if _empty(df.at[idx, slot]):  # live read, not stale snapshot
                            df.at[idx, slot] = url
                            df.at[idx, f'Link {i+1} Anchor Text'] = \
                                df.loc[df[url_col] == url, keyword_col].values[0]
                            link_usage[url] += 1
                            links_added += 1
                            break
                if links_added == link_count:
                    break
# ---------------------------------------------------------------------------
# Streamlit UI — upload a CSV, map its columns, then build an internal
# linking map where every page links to its most relevant siblings.
# ---------------------------------------------------------------------------
st.title("Internal Linking Mapper")

# Step 1: Upload CSV
uploaded_file = st.file_uploader("Upload your CSV", type=["csv"])
if uploaded_file:
    df = pd.read_csv(uploaded_file)

    # Step 2: Map fields
    st.subheader("Map your fields")
    cluster_column = st.selectbox("Select the Cluster column", df.columns)
    hub_spoke_column = st.selectbox("Select the Hub Or Spoke column", df.columns)
    target_keyword_column = st.selectbox("Select the Target Keyword column", df.columns)
    title_tag_column = st.selectbox("Select the Page Title Tag column", df.columns)
    url_column = st.selectbox("Select the Full URL column", df.columns)

    # Step 3: Link Count Slider
    link_count = st.slider("Set Number of Internal Links per Page", min_value=1, max_value=10, value=5)

    # Calculate the minimum repeat limit required
    min_repeat_limit = calculate_minimum_repeat_limit(df, link_count)

    # Step 4: Repeat Limit Slider
    repeat_limit = st.slider("Set Repeat Limit", min_value=1, max_value=10, value=min_repeat_limit)

    # Step 5: Map Every Row? Checkbox
    map_every_row = st.checkbox("Map Every Row?")

    # Warning if the repeat limit is too low
    row_count = df.shape[0]
    st.warning(f"{row_count} rows detected. You need a repeat link limit of at least {min_repeat_limit} to ensure every URL gets {link_count} links.")

    # Step 6: Calculate Relevance Score By
    st.subheader("Calculate Relevance Score By")
    st.write("Select two columns to calculate the relevance score by. Example: Target Keyword Vs Title Tag: This will take the target keyword of a row, and calculate the relevance score vs the title tag of every other row.")
    col1, col2 = st.columns(2)
    with col1:
        relevance_column1 = st.selectbox("Row Calculation", df.columns)
    with col2:
        relevance_column2 = st.selectbox("Column Calculation", df.columns)

    # Step 7: Map Links Button
    if st.button("Map Links"):
        relevance_scores = calculate_relevance_scores(df, relevance_column1, relevance_column2)
        # Each row stores its own row of the n x n similarity matrix.
        df['Relevance Scores'] = relevance_scores.tolist()

        # Pre-create the output columns for URLs and anchor texts.
        for i in range(1, link_count + 1):
            df[f'Link {i} URL'] = ""
            df[f'Link {i} Anchor Text'] = ""

        # Track how many times each URL has been used as a link target.
        link_usage = {url: 0 for url in df[url_column]}
        st.write("Processing Rows...")

        for idx, row in df.iterrows():
            title_scores = row['Relevance Scores']
            # Candidates ordered by least-used first, then highest relevance.
            # NOTE(review): df.at[k, url_column] assumes a default RangeIndex
            # (fresh read_csv gives one) — confirm if the index ever changes.
            sorted_scores_idx = sorted(
                range(len(title_scores)),
                key=lambda k: (link_usage[df.at[k, url_column]], -title_scores[k]),
            )
            top_links = []
            # First pass: use links still within their repeat limit.
            # Fix: iterate every candidate and rely on the explicit self-check —
            # the old `[1:]` skip assumed the row itself always sorted first,
            # which the usage-count sort key does not guarantee.
            for link_idx in sorted_scores_idx:
                url = df.at[link_idx, url_column]
                # Fix: compare against the mapped url_column, not a hard-coded
                # 'Full URL' column that may not exist in the uploaded sheet.
                if url != row[url_column] and link_usage[url] < repeat_limit:
                    top_links.append(link_idx)
                    link_usage[url] += 1
                    if len(top_links) == link_count:
                        break

            # Second pass: if Map Every Row is checked, fill remaining slots
            # even beyond the repeat limit.
            if map_every_row and len(top_links) < link_count:
                for link_idx in sorted_scores_idx:
                    if len(top_links) == link_count:
                        break
                    url = df.at[link_idx, url_column]
                    if url != row[url_column] and link_idx not in top_links:
                        top_links.append(link_idx)
                        # Fix: count usage only when the link is actually added
                        # (previously incremented even for skipped duplicates).
                        link_usage[url] += 1

            if row[hub_spoke_column] == "Spoke":
                # NOTE(review): this picks the first Hub in the whole sheet —
                # cluster_column is collected but never used, so spokes are not
                # matched to their own cluster's hub. TODO confirm intent.
                df.at[idx, 'Hub Link URL'] = df[df[hub_spoke_column] == "Hub"][url_column].values[0]
                df.at[idx, 'Hub Link Anchor Text'] = df[df[hub_spoke_column] == "Hub"][target_keyword_column].values[0]

            for i, link_idx in enumerate(top_links):
                df.at[idx, f'Link {i+1} URL'] = df.at[link_idx, url_column]
                df.at[idx, f'Link {i+1} Anchor Text'] = df.at[link_idx, target_keyword_column]

        # Add the link usage count next to each URL.
        # Fix: use the mapped url_column, not a hard-coded 'Full URL'.
        df['Usage Count'] = df[url_column].map(link_usage)

        # Ensure no row is left without links.
        ensure_no_row_without_links(df, link_usage, repeat_limit, link_count)
        st.write("Processing Complete!")

        # Step 8: Show the processed DataFrame in the Streamlit UI
        st.dataframe(df)

        # Step 9: Download CSV
        output_file_name = uploaded_file.name.replace(".csv", "") + " - Internal Linking Map.csv"
        st.download_button(label="Download CSV", data=df.to_csv(index=False), file_name=output_file_name)